From 43016168e21d66250695b7f201d0a8d5551b658c Mon Sep 17 00:00:00 2001
From: "E. S" <the@rootless.one>
Date: Sat, 8 Feb 2025 04:54:42 +0300
Subject: [PATCH] initial

---
 .gitignore        |  3 +++
 idb/__init__.py   |  0
 idb/translator.py | 35 +++++++++++++++++++++++++++++++++++
 idb/wordpress.py  | 22 ++++++++++++++++++++++
 requirements.txt  |  6 ++++++
 single_article.py | 25 +++++++++++++++++++++++++
 6 files changed, 91 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 idb/__init__.py
 create mode 100644 idb/translator.py
 create mode 100644 idb/wordpress.py
 create mode 100644 requirements.txt
 create mode 100755 single_article.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..342783d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+/.idea
+/.venv
+/.env
\ No newline at end of file
diff --git a/idb/__init__.py b/idb/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/idb/translator.py b/idb/translator.py
new file mode 100644
index 0000000..2a78529
--- /dev/null
+++ b/idb/translator.py
@@ -0,0 +1,35 @@
+import tiktoken
+from openai import OpenAI
+
+model = "gpt-4o"  # chat model passed to every completion request in translate()
+system_prompt = "You translate parts of an article from Russian to English. It contains markdown; leave the markup, links and other formatting intact, translating the actual text."
+input_token_limit = 2000  # buffered-paragraph token count at which translate() sends a request
+
+
+def translate(text):
+    """Translate markdown `text` by sending batches of paragraphs to the chat model."""
+    client = OpenAI()
+    batch, batch_tokens, translated = [], 0, []
+    paragraphs = text.split("\n\n")
+    last_index = len(paragraphs) - 1
+    for index, paragraph in enumerate(paragraphs):
+        batch.append(paragraph)
+        batch_tokens += num_tokens_from_string(paragraph)
+        # Keep collecting until the token budget is reached or the input is exhausted.
+        if batch_tokens < input_token_limit and index != last_index:
+            continue
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": '\n\n'.join(batch)},
+        ]
+        response = client.chat.completions.create(messages=messages, model=model)
+        # Only the first choice is used; translated batches are re-joined by blank lines.
+        translated.append(response.choices[0].message.content)
+        batch, batch_tokens = [], 0
+    return "\n\n".join(translated)
+
+
+def num_tokens_from_string(string, encoding_name="o200k_base"):
+    """Return the number of tokens `string` encodes to under tiktoken's `encoding_name`."""
+    # disallowed_special=(): count literal "<|endoftext|>"-style text instead of raising.
+    return len(tiktoken.get_encoding(encoding_name).encode(string, disallowed_special=()))
diff --git a/idb/wordpress.py b/idb/wordpress.py
new file mode 100644
index 0000000..10559f9
--- /dev/null
+++ b/idb/wordpress.py
@@ -0,0 +1,22 @@
+import requests
+from bs4 import BeautifulSoup
+from html import unescape
+from markdownify import markdownify
+from collections import namedtuple
+
+ArticleContent = namedtuple('ArticleContent', 'title html md')  # result record built by fetch()
+
+
+def fetch(url) -> ArticleContent:
+    """Download the post at `url`; return its title, entry-content HTML and markdown."""
+    response = requests.get(url, timeout=30)  # never hang forever on a stalled server
+    response.raise_for_status()  # fail on 4xx/5xx instead of parsing an error page
+    soup = BeautifulSoup(response.text, 'html.parser')
+    flair_element = soup.find(id="jp-post-flair")
+    if flair_element:
+        flair_element.decompose()
+    body, heading = soup.find("div", class_="entry-content"), soup.find(class_="entry-title")
+    if body is None or heading is None:  # str(None) would otherwise yield the article "None"
+        raise ValueError(f"no entry-title/entry-content found at {url}")
+    html = str(body).strip()
+    return ArticleContent(unescape(heading.get_text(strip=True)), html, markdownify(html).strip())
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..6ffb267
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+beautifulsoup4~=4.13.3
+markdownify~=0.14.1
+requests~=2.32.3
+openai~=1.61.1
+python-dotenv~=1.0.1
+tiktoken~=0.8.0
\ No newline at end of file
diff --git a/single_article.py b/single_article.py
new file mode 100755
index 0000000..1c3e3bf
--- /dev/null
+++ b/single_article.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+from argparse import ArgumentParser
+from idb.wordpress import fetch
+from idb.translator import translate
+from dotenv import load_dotenv
+
+load_dotenv()  # NOTE(review): presumably loads OPENAI_API_KEY for OpenAI() from .env — confirm
+
+def save(file, content):  # write text `content` to path `file`, overwriting it
+    with open(file, 'w', encoding='utf-8') as f:  # UTF-8, not the locale default: FILE_RU text
+        f.write(content)
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser()
+    parser.add_argument('-url', '--url', type=str, required=True, help='article link')
+    # Required: otherwise args.output is None and indexing it fails only after translate() ran.
+    parser.add_argument('--output', nargs=2, metavar=('FILE_RU', 'FILE_EN'), required=True,
+                        help="output files")
+    args = parser.parse_args()
+
+    article = fetch(args.url)
+    translation = translate(article.md)
+    save(args.output[0], article.md)
+    save(args.output[1], translation)