Google翻译文件


import os
import time
import fitz  # PyMuPDF
from deep_translator import GoogleTranslator
from requests.exceptions import RequestException

# Upper bound on the number of characters sent in one translation request.
# deep_translator validates that a payload is strictly shorter than 5000
# characters, so a chunk of exactly 5000 would be rejected; stay one below.
MAX_CHUNK_SIZE = 4999
RETRY_COUNT = 3  # Number of attempts made per chunk before giving up

# Source directory that is scanned, and directory the translations are written to
source_path = r'S:\temp\vk\English'
translate_path = r'S:\temp\vk\cn'

def split_text(text, max_chunk_size):
    """Split *text* into chunks of at most *max_chunk_size* characters.

    Chunks are cut at the last space found before the limit so that words are
    not broken; when no usable space exists the text is cut exactly at the
    limit. Whitespace at the start of the remainder is dropped after each cut.

    Args:
        text: The string to split.
        max_chunk_size: Maximum length of each returned chunk; must be > 0.

    Returns:
        A list of chunks. It always holds at least one element (``[text]`` when
        the text already fits, including the empty string), and never contains
        an empty chunk otherwise.

    Raises:
        ValueError: If *max_chunk_size* is not positive (the loop could never
            make progress).
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive integer")

    chunks = []
    while len(text) > max_chunk_size:
        # Find the split point that does not cut off words
        split_point = text.rfind(' ', 0, max_chunk_size)
        # -1: no space at all; 0: the only space is the first character, which
        # would produce an empty chunk. In both cases cut hard at the limit.
        if split_point <= 0:
            split_point = max_chunk_size
        chunks.append(text[:split_point])
        text = text[split_point:].lstrip()

    # Keep the remainder, but do not add an empty tail after real chunks.
    if text or not chunks:
        chunks.append(text)
    return chunks

def translate_text(text, dest_language='zh-CN'):
    """Translate *text* with GoogleTranslator, chunk by chunk.

    The text is split with ``split_text`` into pieces of at most
    ``MAX_CHUNK_SIZE`` characters. Each piece is translated with up to
    ``RETRY_COUNT`` attempts; if every attempt fails with a
    ``RequestException`` the untranslated piece is kept in its place.

    Args:
        text: The text to translate.
        dest_language: Target language code passed to GoogleTranslator.

    Returns:
        The translated pieces concatenated without a separator. Empty or
        whitespace-only input is returned unchanged without any request.

    Note:
        Only ``RequestException`` is retried; any other exception raised by
        the translator propagates to the caller.
    """
    # Blank paragraphs (common after splitting on '\n\n') carry nothing to
    # translate, so skip the network round-trip entirely.
    if not text or not text.strip():
        return text

    translator = GoogleTranslator(source='auto', target=dest_language)
    translated_parts = []

    for chunk in split_text(text, MAX_CHUNK_SIZE):
        if not chunk.strip():
            translated_parts.append(chunk)
            continue

        for attempt in range(RETRY_COUNT):
            try:
                translation = translator.translate(chunk)
            except RequestException as e:
                print(f"翻译错误: {e}. 正在尝试重新连接...({attempt+1}/{RETRY_COUNT})")
                # Wait before retrying, but not after the final attempt.
                if attempt + 1 < RETRY_COUNT:
                    time.sleep(2)
            else:
                # Defensive: a None result cannot be joined; keep the source chunk.
                translated_parts.append(chunk if translation is None else translation)
                break
        else:
            # All retries failed: fall back to the original text for this chunk.
            print("翻译失败,返回原文本")
            translated_parts.append(chunk)

    return ''.join(translated_parts)

def check_pdf_validity(file_path):
    """Report whether *file_path* can be opened as a PDF.

    The file is opened with ``fitz.open`` and closed again right away. A
    message is printed for either outcome.

    Returns:
        True when opening succeeded, False when any exception was raised.
    """
    try:
        # Opening is the whole check; the handle is released immediately.
        fitz.open(file_path).close()
        print(f"文件 {file_path} 是有效的PDF。")
    except Exception as err:
        print(f"文件 {file_path} 不是有效的PDF或已损坏: {err}")
        return False
    return True

def process_pdf_file(file_path):
    """Extract the text of a PDF, translate it and save it as Markdown.

    The file is first checked with ``check_pdf_validity``. Text is then read
    page by page (pages without text are reported and skipped), split into
    paragraphs on blank lines, translated with ``translate_text`` and written
    to ``<name>_cn.md`` inside the module-level ``translate_path`` directory.

    Args:
        file_path: Path of the PDF to process.

    Returns:
        None. Any exception raised while processing is caught and reported
        with a printed message, so one bad file does not stop a batch.
    """
    if not check_pdf_validity(file_path):
        return  # Skip this file if it is not a valid PDF

    try:
        pdf_document = fitz.open(file_path)
        try:
            page_texts = []
            for page_num in range(len(pdf_document)):
                text = pdf_document.load_page(page_num).get_text()
                if not text.strip():  # Empty page or non-text content only
                    print(f"页面 {page_num} 中没有提取到文本")
                    continue
                page_texts.append(text)
        finally:
            # Release the document even when a page fails to load or extract.
            pdf_document.close()

        full_text = ''.join(page_texts)
        if not full_text.strip():
            print(f"文件 {file_path} 中没有提取到文本")
            return

        # Print extracted text for debugging
        print("\n[翻译前文本]:")
        print(full_text[:2000])  # Print the first 2000 characters for brevity

        # Translate paragraph by paragraph (paragraphs are separated by blank lines)
        paragraphs = full_text.split('\n\n')
        translated_paragraphs = [translate_text(p) for p in paragraphs]
        translated_content = '\n\n'.join(translated_paragraphs)

        # Build the output name from the stem so that only the real extension
        # is replaced, not every '.pdf' occurrence inside the file name.
        stem = os.path.splitext(os.path.basename(file_path))[0]
        new_file_path = os.path.join(translate_path, stem + '_cn.md')
        with open(new_file_path, 'w', encoding='utf-8') as file:
            file.write(translated_content)
        print(f"文件 {file_path} 处理完成,保存为 {new_file_path}")
    except Exception as e:
        print(f"处理文件 {file_path} 时发生错误: {e}")

def process_md_file(file_path):
    """Translate a UTF-8 Markdown file and save the result as ``<name>_cn.md``.

    The content is split into paragraphs on blank lines, each paragraph is
    translated with ``translate_text``, and the joined result is written into
    the module-level ``translate_path`` directory. The first 2000 characters
    of the input and of the output are printed for debugging.

    Args:
        file_path: Path of the Markdown file to process.

    Returns:
        None. Any exception raised while processing is caught and reported
        with a printed message, so one bad file does not stop a batch.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Print original content for debugging
        print("\n[翻译前文本]:")
        print(content[:2000])  # Print the first 2000 characters for brevity

        # Translate paragraph by paragraph (paragraphs are separated by blank lines)
        paragraphs = content.split('\n\n')
        translated_paragraphs = [translate_text(p) for p in paragraphs]
        translated_content = '\n\n'.join(translated_paragraphs)

        # Print translated content for debugging
        print("\n[翻译后文本]:")
        print(translated_content[:2000])  # Print the first 2000 characters for brevity

        # Build the output name from the stem so that only the real extension
        # is replaced, not every '.md' occurrence inside the file name.
        stem = os.path.splitext(os.path.basename(file_path))[0]
        new_file_path = os.path.join(translate_path, stem + '_cn.md')
        with open(new_file_path, 'w', encoding='utf-8') as file:
            file.write(translated_content)
        print(f"文件 {file_path} 处理完成,保存为 {new_file_path}")
    except Exception as e:
        print(f"处理文件 {file_path} 时发生错误: {e}")

def process_txt_file(file_path):
    """Translate a UTF-8 text file and save the result as ``<name>_cn.txt``.

    The content is split into paragraphs on blank lines, each paragraph is
    translated with ``translate_text``, and the joined result is written into
    the module-level ``translate_path`` directory. The first 2000 characters
    of the input and of the output are printed for debugging.

    Args:
        file_path: Path of the text file to process.

    Returns:
        None. Any exception raised while processing is caught and reported
        with a printed message, so one bad file does not stop a batch.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Print original content for debugging
        print("\n[翻译前文本]:")
        print(content[:2000])  # Print the first 2000 characters for brevity

        # Translate paragraph by paragraph (paragraphs are separated by blank lines)
        paragraphs = content.split('\n\n')
        translated_paragraphs = [translate_text(p) for p in paragraphs]
        translated_content = '\n\n'.join(translated_paragraphs)

        # Print translated content for debugging
        print("\n[翻译后文本]:")
        print(translated_content[:2000])  # Print the first 2000 characters for brevity

        # Build the output name from the stem so that only the real extension
        # is replaced, not every '.txt' occurrence inside the file name.
        stem = os.path.splitext(os.path.basename(file_path))[0]
        new_file_path = os.path.join(translate_path, stem + '_cn.txt')
        with open(new_file_path, 'w', encoding='utf-8') as file:
            file.write(translated_content)
        print(f"文件 {file_path} 处理完成,保存为 {new_file_path}")
    except Exception as e:
        print(f"处理文件 {file_path} 时发生错误: {e}")

def translate_files_in_directory(source_path, translate_path):
    """Walk *source_path* recursively and translate every supported file.

    Files ending in ``.md``, ``.txt`` or ``.pdf`` are handed to the matching
    ``process_*_file`` function; files whose names already end in ``_cn.md``,
    ``_cn.txt`` or ``_cn.pdf`` are treated as translated and skipped. Other
    files are ignored.

    Args:
        source_path: Root directory to scan.
        translate_path: Output directory; created (with parents) if missing.

    Note:
        The ``process_*_file`` functions are called with the file path only and
        write to the module-level ``translate_path``, so this parameter only
        controls which directory is created here.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(translate_path, exist_ok=True)

    already_translated = ('_cn.md', '_cn.txt', '_cn.pdf')

    for root, _, files in os.walk(source_path):
        for file_name in files:
            file_path = os.path.join(root, file_name)

            # Check if the file has already been translated
            if file_name.endswith(already_translated):
                print(f"跳过已经翻译的文件: {file_path}")
                continue

            if file_name.endswith('.md'):
                process_md_file(file_path)
            elif file_name.endswith('.txt'):
                process_txt_file(file_path)
            elif file_name.endswith('.pdf'):
                process_pdf_file(file_path)
            # Handlers for other formats can be added here

# Run the translation over the configured source folder (source_path), using translate_path as the output directory
translate_files_in_directory(source_path, translate_path)

留下评论

您的邮箱地址不会被公开。 必填项已用 * 标注