From 43016168e21d66250695b7f201d0a8d5551b658c Mon Sep 17 00:00:00 2001
From: "E. S"
Date: Sat, 8 Feb 2025 04:54:42 +0300
Subject: [PATCH] initial

---
 .gitignore        |  3 +++
 idb/__init__.py   |  0
 idb/translator.py | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 idb/wordpress.py  | 39 +++++++++++++++++++++++++++++++++++++++
 requirements.txt  |  6 ++++++
 single_article.py | 35 +++++++++++++++++++++++++++++++++++
 6 files changed, 141 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 idb/__init__.py
 create mode 100644 idb/translator.py
 create mode 100644 idb/wordpress.py
 create mode 100644 requirements.txt
 create mode 100755 single_article.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..342783d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+/.idea
+/.venv
+/.env
\ No newline at end of file
diff --git a/idb/__init__.py b/idb/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/idb/translator.py b/idb/translator.py
new file mode 100644
--- /dev/null
+++ b/idb/translator.py
@@ -0,0 +1,58 @@
+"""Translate Markdown articles from Russian to English via the OpenAI API."""
+import tiktoken
+from openai import OpenAI
+
+model = "gpt-4o"
+system_prompt = "You translate parts of an article from Russian to English. It contains markdown; leave the markup, links and other formatting intact, translating the actual text."
+# Soft per-request budget: a chunk is flushed once it reaches this many tokens,
+# so it may overshoot by up to one paragraph.
+input_token_limit = 2000
+
+
+def translate(text):
+    """Translate Markdown ``text`` chunk by chunk and return the joined result.
+
+    The text is split on blank lines and the paragraphs are grouped into
+    chunks of roughly ``input_token_limit`` tokens; each chunk is sent to
+    the chat completions API as a separate request.  Blank input returns
+    an empty string without calling the API.
+    """
+    if not text.strip():
+        return ""
+    cl = OpenAI()
+    translation = []
+    for chunk in _chunk_paragraphs(text.split("\n\n"), input_token_limit):
+        chat_completion = cl.chat.completions.create(
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": chunk},
+            ],
+            model=model,
+        )
+        # The API types ``content`` as optional; keep the join below safe.
+        translation.append(chat_completion.choices[0].message.content or "")
+    return "\n\n".join(translation)
+
+
+def _chunk_paragraphs(paragraphs, limit):
+    """Yield paragraphs joined into chunks of at least ``limit`` tokens each.
+
+    The final chunk holds whatever is left and may be shorter.
+    """
+    buf = []
+    bufsize = 0
+    for paragraph in paragraphs:
+        buf.append(paragraph)
+        bufsize += num_tokens_from_string(paragraph)
+        if bufsize >= limit:
+            yield "\n\n".join(buf)
+            buf = []
+            bufsize = 0
+    if buf:
+        yield "\n\n".join(buf)
+
+
+def num_tokens_from_string(string, encoding_name="o200k_base"):
+    """Count the tokens of ``string`` in the named tiktoken encoding."""
+    encoding = tiktoken.get_encoding(encoding_name)
+    return len(encoding.encode(string))
diff --git a/idb/wordpress.py b/idb/wordpress.py
new file mode 100644
--- /dev/null
+++ b/idb/wordpress.py
@@ -0,0 +1,39 @@
+"""Fetch a WordPress article and expose it as HTML and Markdown."""
+import requests
+from bs4 import BeautifulSoup
+from html import unescape
+from markdownify import markdownify
+from collections import namedtuple
+
+ArticleContent = namedtuple('ArticleContent', ['title', 'html', 'md'])
+
+
+def fetch(url, timeout=30) -> ArticleContent:
+    """Download the article at ``url`` and return its title, HTML and Markdown.
+
+    ``timeout`` is passed to ``requests.get`` (seconds) so a stalled server
+    cannot hang the script.  Raises ``requests.HTTPError`` for a 4xx/5xx
+    response and ``ValueError`` when the page has no ``entry-content`` or
+    ``entry-title`` element.
+    """
+    response = requests.get(url, timeout=timeout)
+    # Fail loudly instead of parsing (and later translating) an error page.
+    response.raise_for_status()
+    soup = BeautifulSoup(response.text, 'html.parser')
+
+    # Drop the "jp-post-flair" block so it does not end up in the article body.
+    flair_element = soup.find(id="jp-post-flair")
+    if flair_element:
+        flair_element.decompose()
+
+    content_element = soup.find("div", class_="entry-content")
+    title_element = soup.find(class_="entry-title")
+    if content_element is None or title_element is None:
+        # str(None) would otherwise yield the literal article text "None".
+        raise ValueError(f"no article content or title found at {url}")
+
+    html = str(content_element).strip()
+    title = unescape(title_element.get_text(strip=True))
+    md = markdownify(html).strip()
+
+    return ArticleContent(title, html, md)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..6ffb267
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+beautifulsoup4~=4.13.3
+markdownify~=0.14.1
+requests~=2.32.3
+openai~=1.61.1
+python-dotenv~=1.0.1
+tiktoken~=0.8.0
\ No newline at end of file
diff --git a/single_article.py b/single_article.py
new file mode 100755
--- /dev/null
+++ b/single_article.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+"""Fetch a WordPress article, translate it and save both versions as Markdown."""
+from argparse import ArgumentParser
+from idb.wordpress import fetch
+from idb.translator import translate
+from dotenv import load_dotenv
+
+load_dotenv()
+
+
+def save(file, content):
+    """Write ``content`` to ``file`` as UTF-8, replacing any existing file."""
+    # Explicit encoding: the platform default may be unable to encode Cyrillic.
+    with open(file, 'w', encoding='utf-8') as f:
+        f.write(content)
+
+
+def main():
+    """Parse the command line, then fetch, save and translate the article."""
+    parser = ArgumentParser()
+    parser.add_argument('-url', '--url', type=str, required=True, help='article link')
+    # Required: both paths are written below, so a missing --output must be
+    # rejected up front rather than after the fetch and translation have run.
+    parser.add_argument('--output', nargs=2, metavar=('FILE_RU', 'FILE_EN'),
+                        required=True, help="output files")
+    args = parser.parse_args()
+
+    article = fetch(args.url)
+    # Save the original first so it survives a failed translation.
+    save(args.output[0], article.md)
+    save(args.output[1], translate(article.md))
+
+
+if __name__ == '__main__':
+    main()