51 lines
1.6 KiB
Python
51 lines
1.6 KiB
Python
import tiktoken
|
||
from openai import OpenAI
|
||
|
||
# Chat-completions model used for every translation request.
model = "gpt-4o"

# System prompt sent with each chunk of text. Every element the model must
# leave untouched is shown with a concrete Markdown example; the image bullet
# previously had an empty example, so the image syntax was never demonstrated.
system_prompt = """
Translate the following text from Russian to English while strictly preserving the markup, and also following elements in their original form:
- Quotes (e.g., > quoted text). Can be multi-line.
- Links (e.g., [text](url))
- Images (e.g., ![alt text](url))

Do not modify or translate these elements, leave them exactly as they appear in the original text. Only translate the surrounding content.
"""

# Token threshold (measured with num_tokens_from_string) that translate_markdown
# uses to decide when the buffered paragraphs are sent as one request.
input_token_limit = 5000
|
||
|
||
|
||
def translate_markdown(text):
    """Translate Markdown text with the OpenAI chat-completions API.

    The text is split into paragraphs on blank lines ("\\n\\n"), the paragraphs
    are grouped into chunks bounded by ``input_token_limit``, and each chunk is
    sent as a separate request using ``system_prompt`` and ``model``. The
    translated chunks are joined back with blank lines and typographic quotes
    are normalised to their ASCII equivalents.

    Args:
        text: Markdown source to translate.

    Returns:
        The translated text. Empty or whitespace-only input is returned
        unchanged without contacting the API.

    Raises:
        RuntimeError: If the API returns a choice whose message has no text
            content, so a chunk cannot be silently dropped from the result.
    """
    # Nothing to translate: skip the network round-trip entirely.
    if not text.strip():
        return text

    client = OpenAI()
    translated_chunks = []
    chunks = _chunk_paragraphs(text.split("\n\n"), input_token_limit)
    for chunk_number, chunk in enumerate(chunks, start=1):
        completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": chunk},
            ],
            model=model,
        )
        content = completion.choices[0].message.content
        if content is None:
            # Fail loudly here instead of with an opaque TypeError in join().
            raise RuntimeError(
                f"Translation API returned no text for chunk {chunk_number}"
            )
        translated_chunks.append(content)

    # Replace curly quotes with plain ASCII quotes in a single pass.
    quote_table = str.maketrans({"”": '"', "“": '"', "’": "'"})
    return "\n\n".join(translated_chunks).translate(quote_table)


def _chunk_paragraphs(paragraphs, token_limit):
    """Yield paragraphs re-joined with blank lines, grouped by token budget.

    A chunk is emitted before the paragraph that would push its token count
    past ``token_limit``. A single paragraph larger than the limit is emitted
    on its own, because paragraphs are never split. Only paragraph tokens are
    counted; the "\\n\\n" separators are not.
    """
    pending = []
    pending_tokens = 0
    for paragraph in paragraphs:
        paragraph_tokens = num_tokens_from_string(paragraph)
        if pending and pending_tokens + paragraph_tokens > token_limit:
            yield "\n\n".join(pending)
            pending = []
            pending_tokens = 0
        pending.append(paragraph)
        pending_tokens += paragraph_tokens
    if pending:
        yield "\n\n".join(pending)
|
||
|
||
|
||
def num_tokens_from_string(string, encoding_name="o200k_base"):
    """Return the number of tokens *string* occupies in a tiktoken encoding.

    Args:
        string: Text to measure.
        encoding_name: Name of the tiktoken encoding to load; defaults to
            "o200k_base".

    Returns:
        The length of the token sequence produced by encoding *string*.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # The input is arbitrary document text, so special-token markers such as
    # "<|endoftext|>" must be counted as ordinary text. With tiktoken's default
    # (disallowed_special="all") encode() raises ValueError when it sees them.
    return len(encoding.encode(string, disallowed_special=()))
|