idb_utils/idb/translator.py
2025-05-17 04:28:55 +03:00

51 lines
1.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import tiktoken
from openai import OpenAI
# Chat model name passed to every chat-completion request made by
# translate_markdown.
model = "gpt-4o"
# System message sent with every request. It tells the model to translate
# Russian to English while leaving quotes, links and images untouched.
system_prompt = """
Translate the following text from Russian to English while strictly preserving the markup, and also following elements in their original form:
- Quotes (e.g., > quoted text). Can be multi-line.
- Links (e.g., [text](url))
- Images (e.g., ![alt text](image_url))
Do not modify or translate these elements, leave them exactly as they appear in the original text. Only translate the surrounding content.
"""
# Token threshold for one request. Paragraphs are accumulated until their
# combined token count reaches or exceeds this value, so it is a soft limit:
# a batch can overshoot by up to one paragraph.
input_token_limit = 5000
def translate_markdown(text):
    """Translate Markdown text by sending it to the chat model in batches.

    The text is split into paragraphs on blank lines. Paragraphs are
    accumulated until their combined token count reaches
    ``input_token_limit`` (or the input ends); each batch is then sent as
    one chat-completion request together with ``system_prompt``. The
    replies are joined with blank lines and typographic quotes in the
    result are replaced with plain ASCII quotes.

    Args:
        text: Markdown source to translate.

    Returns:
        The translated text. Empty or whitespace-only input is returned
        unchanged (``None`` becomes ``""``) without contacting the API.
    """
    # Nothing to translate: avoid a pointless (and billable) request with
    # an empty user message.
    if not text or not text.strip():
        return text or ""

    cl = OpenAI()
    paragraphs = text.split("\n\n")
    last_index = len(paragraphs) - 1

    translation = []
    buf = []
    bufsize = 0
    for i, paragraph in enumerate(paragraphs):
        buf.append(paragraph)
        bufsize += num_tokens_from_string(paragraph)

        # Keep accumulating until the soft token limit is reached; the
        # final paragraph always flushes whatever is buffered.
        if bufsize < input_token_limit and i != last_index:
            continue

        chat_completion = cl.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": "\n\n".join(buf)},
            ],
            model=model,
        )
        # ``content`` may be None; keep the batch position with an empty
        # string so the final join cannot fail.
        translation.append(chat_completion.choices[0].message.content or "")
        buf = []
        bufsize = 0

    return _normalize_quotes("\n\n".join(translation))


def _normalize_quotes(text):
    """Replace typographic quotes in *text* with their ASCII equivalents."""
    # The characters are spelled as escapes on purpose: the literal glyphs
    # are easily lost or confused, and an empty search string would make
    # ``str.replace`` insert the replacement between every character.
    # NOTE(review): assumed to be U+201C, U+201D and U+2019 - confirm
    # against the original file.
    quote_table = str.maketrans({
        "\u201c": '"',
        "\u201d": '"',
        "\u2019": "'",
    })
    return text.translate(quote_table)
def num_tokens_from_string(string, encoding_name="o200k_base"):
    """Return the number of tokens that *string* encodes to.

    Args:
        string: Text to measure.
        encoding_name: Name of the tiktoken encoding to use.

    Returns:
        The length of the token sequence produced by the encoding.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # By default tiktoken raises ValueError when the text contains a
    # special-token spelling such as "<|endoftext|>". When counting tokens
    # of arbitrary document text, treat such sequences as ordinary text.
    return len(encoding.encode(string, disallowed_special=()))