idb_utils/idb/translator.py
2025-02-09 04:17:39 +03:00

36 lines
1.2 KiB
Python

import tiktoken
from openai import OpenAI
# Chat model name passed to the completions API by translate().
model = "gpt-4o"

# System message sent with every translation request.
system_prompt = (
    "You translate parts of an article from Russian to English. "
    "It contains markdown; leave the markup, links and other formatting intact, "
    "translating the actual text. "
    "Also don't translate citations."
)

# Token count at which translate() flushes its buffered paragraphs to the API.
input_token_limit = 2000
def translate(text: str) -> str:
    """Translate an article to English by sending it to the chat model in chunks.

    The text is split into paragraphs on blank lines. Paragraphs are collected
    into a buffer, and the buffer is sent as a single request once its token
    count reaches ``input_token_limit`` or the last paragraph has been added.
    The limit is checked after a paragraph is appended, so a request may
    exceed it by up to one paragraph.

    Args:
        text: The source text to translate.

    Returns:
        The model's replies for each chunk, joined by blank lines. Empty or
        whitespace-only input is returned unchanged without calling the API.
    """
    # Nothing to translate: avoid sending an empty user message, which would
    # return the model's reply to "nothing" as if it were a translation.
    if not text.strip():
        return text

    client = OpenAI()
    paragraphs = text.split("\n\n")
    last_index = len(paragraphs) - 1

    translated_chunks = []
    pending = []
    pending_tokens = 0
    for index, paragraph in enumerate(paragraphs):
        pending.append(paragraph)
        pending_tokens += num_tokens_from_string(paragraph)

        # Keep buffering until the budget is reached or the input is exhausted.
        if pending_tokens < input_token_limit and index != last_index:
            continue

        completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": "\n\n".join(pending)},
            ],
            model=model,
        )
        translated_chunks.append(completion.choices[0].message.content)
        pending = []
        pending_tokens = 0

    return "\n\n".join(translated_chunks)
def num_tokens_from_string(string, encoding_name="o200k_base"):
    """Count the tokens that *string* encodes to under a tiktoken encoding.

    Args:
        string: The text to tokenize.
        encoding_name: Name of the tiktoken encoding to look up.

    Returns:
        The length of the token sequence produced by encoding ``string``.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)