initial
This commit is contained in:
commit
43016168e2
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
/.idea
|
||||||
|
/.venv
|
||||||
|
/.env
|
0
idb/__init__.py
Normal file
0
idb/__init__.py
Normal file
35
idb/translator.py
Normal file
35
idb/translator.py
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
import tiktoken
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
|
# Chat model used for every translation request.
model = "gpt-4o"
# System message sent with each chunk; it tells the model to translate from
# Russian to English while leaving the markdown markup untouched.
system_prompt = (
    "You translate parts of an article from Russian to English. "
    "It contains markdown; leave the markup, links and other formatting intact, "
    "translating the actual text."
)
# Token threshold translate() uses to decide when the buffered paragraphs
# are sent as one request.
input_token_limit = 2000
|
||||||
|
|
||||||
|
|
||||||
|
def _translate_chunk(client, chunk_text):
    """Send one chunk of markdown to the chat API and return the reply text."""
    completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": chunk_text},
        ],
        model=model,
    )
    content = completion.choices[0].message.content
    if content is None:
        # Fail at the offending chunk instead of with a TypeError in the
        # final join, after every other request has already been paid for.
        raise RuntimeError("translation request returned no text content")
    return content


def translate(text):
    """Translate a markdown article with the OpenAI chat API.

    The text is split on blank lines into paragraphs. Paragraphs are grouped
    into chunks whose token count (per ``num_tokens_from_string``) stays
    within ``input_token_limit``; each chunk is sent as its own request using
    ``model`` and ``system_prompt``, and the replies are joined with blank
    lines. A single paragraph larger than the limit is sent on its own. The
    ``"\\n\\n"`` separators are not counted, so the limit is approximate.

    Args:
        text: Markdown source of the article.

    Returns:
        The translated markdown, or ``""`` when ``text`` is empty or only
        whitespace (no API request is made in that case).

    Raises:
        RuntimeError: If the API returns a reply without text content.
    """
    if not text or not text.strip():
        return ""

    client = OpenAI()
    translation = []
    chunk = []
    chunk_tokens = 0

    for paragraph in text.split("\n\n"):
        paragraph_tokens = num_tokens_from_string(paragraph)
        # Flush *before* adding a paragraph that would push the chunk past
        # the limit, so the limit is an upper bound rather than a trigger.
        if chunk and chunk_tokens + paragraph_tokens > input_token_limit:
            translation.append(_translate_chunk(client, "\n\n".join(chunk)))
            chunk = []
            chunk_tokens = 0
        chunk.append(paragraph)
        chunk_tokens += paragraph_tokens

    if chunk:
        translation.append(_translate_chunk(client, "\n\n".join(chunk)))

    return "\n\n".join(translation)
|
||||||
|
|
||||||
|
|
||||||
|
def num_tokens_from_string(string, encoding_name="o200k_base"):
    """Return how many tokens ``string`` encodes to under the named tiktoken encoding."""
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)
|
22
idb/wordpress.py
Normal file
22
idb/wordpress.py
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from html import unescape
|
||||||
|
from markdownify import markdownify
|
||||||
|
from collections import namedtuple
|
||||||
|
|
||||||
|
# Result of fetch(): the article title, the extracted content HTML, and the
# markdown conversion of that HTML.
ArticleContent = namedtuple('ArticleContent', ['title', 'html', 'md'])
|
||||||
|
|
||||||
|
|
||||||
|
def fetch(url, timeout=30) -> ArticleContent:
    """Download an article page and extract its title and body.

    The element with id ``jp-post-flair`` is removed if present (presumably
    the sharing/likes widget appended to posts -- confirm). The body is the
    ``div`` with class ``entry-content``; the title is the text of the element
    with class ``entry-title``.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        An ``ArticleContent`` with the title, the body HTML, and the body
        converted to markdown.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        ValueError: If the page has no ``entry-content`` or ``entry-title``
            element.
    """
    # Without a timeout requests may wait on an unresponsive server forever.
    response = requests.get(url, timeout=timeout)
    # Do not try to parse an error page as if it were the article.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    flair_element = soup.find(id="jp-post-flair")
    if flair_element is not None:
        flair_element.decompose()

    content_element = soup.find("div", class_="entry-content")
    if content_element is None:
        # str(None) would otherwise yield the literal text "None" as the article.
        raise ValueError(f"no 'entry-content' element found at {url}")

    title_element = soup.find(class_="entry-title")
    if title_element is None:
        raise ValueError(f"no 'entry-title' element found at {url}")

    html = str(content_element).strip()
    title = unescape(title_element.get_text(strip=True))
    md = markdownify(html).strip()

    return ArticleContent(title, html, md)
|
6
requirements.txt
Normal file
6
requirements.txt
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
beautifulsoup4~=4.13.3
|
||||||
|
markdownify~=0.14.1
|
||||||
|
requests~=2.32.3
|
||||||
|
openai~=1.61.1
|
||||||
|
python-dotenv~=1.0.1
|
||||||
|
tiktoken~=0.8.0
|
25
single_article.py
Executable file
25
single_article.py
Executable file
@ -0,0 +1,25 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
from argparse import ArgumentParser
|
||||||
|
from idb.wordpress import fetch
|
||||||
|
from idb.translator import translate
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
# Load variables from a .env file into the process environment at import time
# (presumably the API credentials picked up by OpenAI() in idb.translator -- confirm).
load_dotenv()
|
||||||
|
|
||||||
|
def save(file, content):
    """Write ``content`` to ``file`` as UTF-8 text, replacing any existing file.

    The encoding is fixed explicitly: the articles contain Cyrillic text, and
    the platform default encoding cannot represent it on every system.

    Args:
        file: Path of the file to write.
        content: Text to store.
    """
    with open(file, 'w', encoding='utf-8') as f:
        f.write(content)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Fetch one article, translate it, and write both versions to disk.

    Command line:
        --url URL                   article link (required)
        --output FILE_RU FILE_EN    where to write the original markdown and
                                    its translation (required)
    """
    parser = ArgumentParser()
    parser.add_argument('-url', '--url', type=str, required=True, help='article link')
    # required=True: both paths are used unconditionally below, so a missing
    # --output must be a usage error rather than a TypeError on None.
    parser.add_argument('--output', nargs=2, metavar=('FILE_RU', 'FILE_EN'), required=True,
                        help="output files")
    args = parser.parse_args()
    file_ru, file_en = args.output

    article = fetch(args.url)
    # Save the source first so it is kept even if the translation step fails.
    save(file_ru, article.md)

    translation = translate(article.md)
    save(file_en, translation)


if __name__ == '__main__':
    main()
|
Loading…
x
Reference in New Issue
Block a user