36 lines
1.2 KiB
Python
36 lines
1.2 KiB
Python
import tiktoken
|
|
from openai import OpenAI
|
|
|
|
model = "gpt-4o"
|
|
system_prompt = "You translate parts of an article from Russian to English. It contains markdown; leave the markup, links and other formatting intact, translating the actual text. Also don't translate citations."
|
|
input_token_limit = 2000
|
|
|
|
|
|
def translate(text):
    """Translate *text* from Russian to English via the OpenAI chat API.

    The text is split into paragraphs on blank lines and accumulated into
    batches of roughly ``input_token_limit`` tokens, so each API request has
    a bounded input size. Translated batches are re-joined with blank lines,
    preserving the article's paragraph structure.

    :param text: markdown article text to translate
    :return: the translated text, paragraphs separated by ``"\\n\\n"``
    """
    # Nothing to translate: skip the (billable) API round-trip entirely.
    if not text:
        return ""

    cl = OpenAI()
    paragraphs = text.split("\n\n")
    buf = []          # paragraphs accumulated for the next request
    bufsize = 0       # token count of the paragraphs in `buf`
    translation = []  # one translated chunk per API call

    for i, paragraph in enumerate(paragraphs):
        buf.append(paragraph)
        bufsize += num_tokens_from_string(paragraph)
        # Flush once the buffer is "full" or the last paragraph is consumed.
        # NOTE: a single paragraph larger than the limit is still sent whole.
        if bufsize >= input_token_limit or i == len(paragraphs) - 1:
            chat_completion = cl.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": '\n\n'.join(buf)},
                ],
                model=model,
            )
            translation.append(chat_completion.choices[0].message.content)
            buf = []
            bufsize = 0

    return "\n\n".join(translation)
|
|
|
|
|
|
def num_tokens_from_string(string, encoding_name="o200k_base"):
    """Return how many tokens *string* encodes to under *encoding_name*.

    ``o200k_base`` is the tokenizer used by the gpt-4o model family.
    """
    return len(tiktoken.get_encoding(encoding_name).encode(string))
|