# idb_utils/idb/translator.py

import tiktoken
from openai import OpenAI

# Name of the chat model that translate_markdown passes to the completions API.
model = "gpt-4o"
# System message sent with every request. It asks for a Russian-to-English
# translation and lists the Markdown constructs (block quotes, links, images)
# that must be returned untouched. The text is sent verbatim, including the
# leading and trailing newline produced by the triple-quoted literal.
system_prompt = """
Translate the following text from Russian to English while strictly preserving the markup, and also following elements in their original form:
- Quotes (e.g., > quoted text). Can be multi-line.
- Links (e.g., [text](url))
- Images (e.g., ![alt text](image_url))

Do not modify or translate these elements, leave them exactly as they appear in the original text. Only translate the surrounding content.
"""
# Token threshold (as counted by num_tokens_from_string) at which
# translate_markdown sends the paragraphs buffered so far as one request.
# The check runs after a paragraph has been added, so a single request can
# exceed this value by up to the size of its final paragraph.
input_token_limit = 5000


# Typographic quotes that are normalised to their ASCII equivalents in the
# final translation.
_ASCII_QUOTES = str.maketrans({
    "”": '"',
    "“": '"',
    "’": "'",
})


def translate_markdown(text):
    """Translate Markdown text through the chat completions API.

    The text is split into blank-line-separated paragraphs, which are
    grouped into chunks (see ``_chunk_paragraphs``). Each chunk is sent as
    one user message together with ``system_prompt``, and the replies are
    joined back with blank lines. Curly double quotes and the right single
    quote in the result are replaced by their ASCII counterparts.

    Args:
        text: Markdown source to translate.

    Returns:
        The translated text. Empty or whitespace-only input is returned
        unchanged (``None`` yields ``""``) without contacting the API.

    Raises:
        RuntimeError: If the API returns a reply with no text content.
    """
    # Nothing to translate: skip the request, whose reply to an empty
    # message would be arbitrary model output.
    if not text or not text.strip():
        return text or ""

    client = OpenAI()
    translated_chunks = []
    for chunk in _chunk_paragraphs(text.split("\n\n")):
        completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": chunk},
            ],
            model=model,
        )
        reply = completion.choices[0].message.content
        if reply is None:
            # Fail here with a clear message instead of letting the final
            # join raise an unrelated-looking TypeError.
            raise RuntimeError(
                "translation request returned no text content"
            )
        translated_chunks.append(reply)

    return "\n\n".join(translated_chunks).translate(_ASCII_QUOTES)


def _chunk_paragraphs(paragraphs):
    """Yield blank-line-joined groups of consecutive paragraphs.

    A group is emitted as soon as its accumulated token count reaches
    ``input_token_limit``; the paragraph that crosses the threshold stays
    in that group, so a group may exceed the threshold. Whatever remains
    after the last paragraph is emitted as a final group.
    """
    pending = []
    pending_tokens = 0
    for paragraph in paragraphs:
        pending.append(paragraph)
        pending_tokens += num_tokens_from_string(paragraph)
        if pending_tokens >= input_token_limit:
            yield "\n\n".join(pending)
            pending = []
            pending_tokens = 0
    if pending:
        yield "\n\n".join(pending)


def num_tokens_from_string(string, encoding_name="o200k_base"):
    """Return the number of tokens *string* occupies in a tiktoken encoding.

    Args:
        string: Text to measure.
        encoding_name: Name of the tiktoken encoding to use.

    Returns:
        The length of the token sequence produced for *string*.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # By default tiktoken raises ValueError when the text contains the
    # literal spelling of a special token (e.g. "<|endoftext|>"). The input
    # here is arbitrary user text, so tokenise such literals as plain text.
    return len(encoding.encode(string, disallowed_special=()))