initial

2025-02-08 04:54:42 +03:00 · 2025-02-08 04:54:42 +03:00 · 43016168e2
commit 43016168e2
6 changed files with 91 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,3 @@
 /.idea
 /.venv
 /.env
--- a/idb/__init__.py
+++ b/idb/__init__.py
--- a/idb/translator.py
+++ b/idb/translator.py
@ -0,0 +1,35 @@
 import tiktoken
 from openai import OpenAI
 # Chat model passed to every completion request made by translate().
 model = "gpt-4o"
 # System message sent with each chunk: translate Russian to English while
 # leaving markdown markup, links and other formatting untouched.
 system_prompt = "You translate parts of an article from Russian to English. It contains markdown; leave the markup, links and other formatting intact, translating the actual text."
 # Token threshold at which the paragraphs buffered by translate() are sent
 # off as one request.
 input_token_limit = 2000
 def translate(text):
    """Translate a markdown article with the configured chat model.

    The text is split into paragraphs on blank lines. Paragraphs are
    buffered until their combined token count reaches ``input_token_limit``
    (the paragraph that crosses the threshold is still included, so a chunk
    may exceed it) or the last paragraph is reached; each buffered chunk is
    then sent as one request together with ``system_prompt``. The translated
    chunks are joined with blank lines.

    Args:
        text: Markdown source of the article.

    Returns:
        The translated markdown, or an empty string when ``text`` is empty
        or whitespace-only (no request is made in that case).

    Raises:
        RuntimeError: If the model returns no text for a chunk.
    """
    # Nothing to translate: do not send a request with an empty user message.
    if not text or not text.strip():
        return ""

    cl = OpenAI()
    translation = []
    paragraphs = text.split("\n\n")
    last_index = len(paragraphs) - 1
    buf = []
    bufsize = 0
    for i, paragraph in enumerate(paragraphs):
        buf.append(paragraph)
        bufsize += num_tokens_from_string(paragraph)
        # Keep buffering until the threshold is hit or the input runs out.
        if bufsize < input_token_limit and i != last_index:
            continue
        chat_completion = cl.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": '\n\n'.join(buf)},
            ],
            model=model,
        )
        content = chat_completion.choices[0].message.content
        # The content may be None; fail here with a clear message instead of
        # a TypeError from the final join.
        if content is None:
            raise RuntimeError(
                f"model returned no text for the chunk ending at paragraph {i + 1}"
            )
        translation.append(content)
        buf = []
        bufsize = 0
    return "\n\n".join(translation)
 def num_tokens_from_string(string, encoding_name="o200k_base"):
    """Return the number of tokens ``string`` encodes to.

    ``encoding_name`` selects the encoding via ``tiktoken.get_encoding`` and
    defaults to "o200k_base".
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)
--- a/idb/wordpress.py
+++ b/idb/wordpress.py
@ -0,0 +1,22 @@
 import requests
 from bs4 import BeautifulSoup
 from html import unescape
 from markdownify import markdownify
 from collections import namedtuple
 # Value returned by fetch(): the article title, the HTML of its body, and
 # that HTML converted to markdown.
 ArticleContent = namedtuple(
    typename='ArticleContent',
    field_names=('title', 'html', 'md'),
 )
 def fetch(url, timeout=30) -> ArticleContent:
    """Download an article page and extract its title and body.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server; passed to ``requests.get``
            so a stalled connection cannot hang the script forever.

    Returns:
        An ArticleContent holding the text of the ``entry-title`` element,
        the HTML of the ``entry-content`` div (with the ``jp-post-flair``
        element removed when present), and that HTML converted to markdown.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page has no ``entry-content`` div or no
            ``entry-title`` element.
    """
    response = requests.get(url, timeout=timeout)
    # Do not parse an error page as if it were the article.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    flair_element = soup.find(id="jp-post-flair")
    if flair_element is not None:
        flair_element.decompose()

    content_element = soup.find("div", class_="entry-content")
    if content_element is None:
        # str(None) would otherwise turn into the literal article body "None".
        raise ValueError(f"no 'entry-content' div found at {url}")
    title_element = soup.find(class_="entry-title")
    if title_element is None:
        raise ValueError(f"no 'entry-title' element found at {url}")

    html = str(content_element).strip()
    title = unescape(title_element.get_text(strip=True))
    md = markdownify(html).strip()
    return ArticleContent(title, html, md)
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,6 @@
 beautifulsoup4~=4.13.3
 markdownify~=0.14.1
 requests~=2.32.3
 openai~=1.61.1
 python-dotenv~=1.0.1
 tiktoken~=0.8.0
--- a/single_article.py
+++ b/single_article.py
@ -0,0 +1,25 @@
 #!/usr/bin/env python3
 from argparse import ArgumentParser
 from idb.wordpress import fetch
 from idb.translator import translate
 from dotenv import load_dotenv
 # Load settings from a .env file (python-dotenv) at import time, before any
 # other code in this script runs.
 # NOTE(review): presumably this supplies the OpenAI API key, since
 # translate() builds its client without arguments — confirm.
 load_dotenv()
 def save(file, content):
    """Write ``content`` to ``file`` as UTF-8 text, replacing any existing file.

    The encoding is fixed so that non-ASCII text (the Russian source in
    particular) is written the same way on every platform instead of in the
    locale's default encoding.

    Args:
        file: Path of the file to write.
        content: Text to store.
    """
    with open(file, 'w', encoding='utf-8') as f:
        f.write(content)
 if __name__ == '__main__':
    # Command-line entry point: fetch one article, translate its markdown,
    # and write the original and the translation to the two given files.
    parser = ArgumentParser()
    parser.add_argument('-url', '--url', type=str, required=True, help='article link')
    # Both paths are used unconditionally below, so the option is mandatory;
    # when it was optional, omitting it failed with a TypeError on None.
    parser.add_argument('--output', nargs=2, metavar=('FILE_RU', 'FILE_EN'),
                        required=True, help="output files")
    args = parser.parse_args()
    file_ru, file_en = args.output

    article = fetch(args.url)
    # Store the source text first so it is kept even if translation fails.
    save(file_ru, article.md)
    translation = translate(article.md)
    save(file_en, translation)