Download an HTML page and convert it to Markdown
import argparse
import requests
import html2text
def download_html_to_markdown(url, output_file, timeout=30):
    """Download the page at *url* and save it as Markdown in *output_file*.

    Args:
        url: Address of the HTML page to fetch.
        output_file: Path of the Markdown file to write (UTF-8, overwritten
            if it already exists).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (via ``raise_for_status``).
        requests.Timeout: If the server does not respond within *timeout*.
    """
    # Fetch the HTML content; fail fast on HTTP errors instead of converting
    # an error page.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Convert HTML to Markdown, keeping hyperlinks in the output.
    h2t = html2text.HTML2Text()
    h2t.ignore_links = False
    markdown_content = h2t.handle(response.text)

    # Write Markdown content to output file
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(markdown_content)
def main():
    """Parse the command line (url, output_file) and run the conversion."""
    cli = argparse.ArgumentParser(
        description='Download an HTML page and convert it to Markdown format'
    )
    # Both arguments are positional and required.
    cli.add_argument(
        'url',
        type=str,
        help='The URL of the HTML page to download and convert',
    )
    cli.add_argument(
        'output_file',
        type=str,
        help='The path of the output Markdown file',
    )
    options = cli.parse_args()
    download_html_to_markdown(options.url, options.output_file)
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()
Use GPT-3 to translate the Markdown file to Chinese
import argparse
import openai
import os
def read_file_paragraphs(file_path, max_tokens=2048):
    """Yield the blank-line-separated paragraphs of a text file.

    Consecutive non-blank lines are accumulated and yielded as one string
    when a blank (whitespace-only) line or the end of the file is reached.
    A paragraph is also cut early when adding the next line would push its
    word count above ``max_tokens``.

    Args:
        file_path: Path of the UTF-8 text file to read.
        max_tokens: Maximum number of whitespace-separated words per yielded
            chunk. This counts words, not model tokens. A single line that
            exceeds the limit is yielded on its own, unsplit.

    Yields:
        Non-empty strings, each the concatenation of consecutive lines with
        their line endings preserved.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        paragraph = []
        tokens = 0
        for line in file:
            if not line.strip():
                # Blank line: close the current paragraph, if there is one.
                if paragraph:
                    yield "".join(paragraph)
                    paragraph = []
                    tokens = 0
                continue

            line_tokens = len(line.split())
            # Flush only when something is buffered; otherwise an oversized
            # first line would emit an empty chunk before itself.
            if paragraph and tokens + line_tokens > max_tokens:
                yield "".join(paragraph)
                paragraph = []
                tokens = 0
            paragraph.append(line)
            tokens += line_tokens

        # Emit whatever is left when the file does not end with a blank line.
        if paragraph:
            yield "".join(paragraph)
def translate_to_chinese(text):
    """Return the Chinese translation of *text* produced by the OpenAI API.

    The API key is read from the ``OPENAI_API_KEY`` environment variable on
    every call. The request goes to the ``text-davinci-003`` completion
    engine, and the first choice's text is returned with surrounding
    whitespace stripped.
    """
    openai.api_key = os.getenv("OPENAI_API_KEY")

    # The instruction asks the model to keep any Markdown markup intact.
    request_prompt = f"Translate the following English text to Chinese and keep the markdown format if exists:\n\n{text}\n"
    completion = openai.Completion.create(
        engine="text-davinci-003",
        prompt=request_prompt,
        temperature=0.8,
        max_tokens=2048,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    first_choice = completion.choices[0]
    return first_choice.text.strip()
def main():
    """Translate a Markdown file to Chinese, one paragraph at a time.

    Reads the positional ``input_file`` and ``output_file`` arguments from
    the command line, translates each paragraph of the input, echoes the
    source and translated text to stdout, and writes all translations to
    the output file once every paragraph has been processed.
    """
    parser = argparse.ArgumentParser(description='Split a Markdown file into paragraphs and translate each paragraph to Chinese using GPT-3 OpenAI API')
    parser.add_argument('input_file', type=str, help='The path of the input Markdown file')
    parser.add_argument('output_file', type=str, help='The path of the output Markdown file in Chinese')
    args = parser.parse_args()

    translated_paragraphs = []
    for paragraph in read_file_paragraphs(args.input_file):
        # Echo source and translation so progress can be followed on stdout.
        print('<<<<<<<<<<<<<')
        print(paragraph)
        chinese_version = translate_to_chinese(paragraph)
        print('>>>>>>>>>>>>>')
        print(chinese_version)
        translated_paragraphs.append(chinese_version)
        print('-------------')

    with open(args.output_file, 'w', encoding='utf-8') as output_file:
        # Each translation comes back stripped of surrounding whitespace, so
        # a blank line is needed between them; with a single newline
        # Markdown would merge adjacent paragraphs into one.
        output_file.write('\n\n'.join(translated_paragraphs))
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()