gpt translator.py

Download an HTML page and convert it to Markdown

import argparse
import requests
import html2text

def download_html_to_markdown(url, output_file, timeout=30):
    """Download the page at *url* and save it as Markdown in *output_file*.

    Args:
        url: Address of the HTML page to fetch.
        output_file: Path of the Markdown file to write. The file is opened
            in write mode with UTF-8 encoding, so existing content is replaced.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not respond within *timeout*.
    """
    # Without an explicit timeout, requests waits indefinitely on a server
    # that accepts the connection but never answers.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Convert HTML to Markdown, keeping hyperlinks in the output.
    converter = html2text.HTML2Text()
    converter.ignore_links = False
    markdown_content = converter.handle(response.text)

    # Write Markdown content to the output file.
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(markdown_content)

def main():
    """Parse the command-line arguments and run the HTML-to-Markdown download."""
    cli = argparse.ArgumentParser(
        description='Download an HTML page and convert it to Markdown format'
    )
    # Both arguments are required positionals: source URL, then destination path.
    cli.add_argument(
        'url',
        type=str,
        help='The URL of the HTML page to download and convert',
    )
    cli.add_argument(
        'output_file',
        type=str,
        help='The path of the output Markdown file',
    )
    options = cli.parse_args()

    download_html_to_markdown(options.url, options.output_file)

# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()

Use GPT-3 to translate the Markdown file to Chinese

import argparse
import openai
import os

def read_file_paragraphs(file_path, max_tokens=2048):
    """Yield the blank-line separated paragraphs of a UTF-8 text file.

    A paragraph is additionally split whenever adding the next line would
    push its size over *max_tokens*. Size is measured in whitespace-separated
    words, which is only a rough stand-in for model tokens.

    Args:
        file_path: Path of the text/Markdown file to read.
        max_tokens: Maximum number of words per yielded chunk. A single line
            longer than this is still yielded whole, as a chunk of its own.

    Yields:
        Non-empty strings made of consecutive lines, each line keeping its
        trailing newline as read from the file.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        paragraph = []
        tokens = 0
        for line in file:
            if not line.strip():
                # Blank line: paragraph boundary. Consecutive blank lines
                # produce nothing because the buffer is already empty.
                if paragraph:
                    yield "".join(paragraph)
                    paragraph = []
                    tokens = 0
                continue

            line_tokens = len(line.split())
            # Flush before overflowing, but only if something is buffered:
            # an oversized first line must not emit an empty chunk.
            if paragraph and tokens + line_tokens > max_tokens:
                yield "".join(paragraph)
                paragraph = []
                tokens = 0
            paragraph.append(line)
            tokens += line_tokens

        # Emit the trailing paragraph when the file does not end with a blank line.
        if paragraph:
            yield "".join(paragraph)

def translate_to_chinese(text):
    """Translate English *text* to Chinese through the OpenAI completion API.

    The API key is read from the ``OPENAI_API_KEY`` environment variable on
    every call.

    Args:
        text: English text, optionally containing Markdown, to translate.

    Returns:
        The text of the first completion choice with surrounding whitespace
        stripped, or an empty string when *text* is empty or whitespace-only.
    """
    # Nothing to translate: skip the API call instead of sending a prompt
    # with no content and getting arbitrary text back.
    if not text or not text.strip():
        return ""

    openai.api_key = os.getenv("OPENAI_API_KEY")
    # NOTE(review): openai.Completion with text-davinci-003 looks like the
    # legacy completions interface -- confirm the installed SDK version and
    # that this model is still served.
    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=f"Translate the following English text to Chinese and keep the markdown format if exists:\n\n{text}\n",
        temperature=0.8,
        max_tokens=2048,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )

    return response.choices[0].text.strip()

def main():
    """Translate a Markdown file to Chinese paragraph by paragraph.

    Reads the input file in chunks via ``read_file_paragraphs``, translates
    each chunk with ``translate_to_chinese``, echoes source and translation
    to stdout for progress tracking, and writes the translations to the
    output file once every chunk has been processed.
    """
    parser = argparse.ArgumentParser(description='Split a Markdown file into paragraphs and translate each paragraph to Chinese using GPT-3 OpenAI API')
    parser.add_argument('input_file', type=str, help='The path of the input Markdown file')
    parser.add_argument('output_file', type=str, help='The path of the output Markdown file in Chinese')
    args = parser.parse_args()

    translated_paragraphs = []

    for paragraph in read_file_paragraphs(args.input_file):
        print('<<<<<<<<<<<<<')
        print(paragraph)
        chinese_version = translate_to_chinese(paragraph)
        print('>>>>>>>>>>>>>')
        print(chinese_version)
        translated_paragraphs.append(chinese_version)
        print('-------------')

    with open(args.output_file, 'w', encoding='utf-8') as output_file:
        # Each translation comes back stripped, so a single newline would
        # fuse neighbouring paragraphs into one Markdown paragraph. A blank
        # line restores the paragraph breaks of the input.
        output_file.write('\n\n'.join(translated_paragraphs))

# Run the translator only when executed as a script.
if __name__ == "__main__":
    main()