import os
import time
import fitz # PyMuPDF
from deep_translator import GoogleTranslator
from requests.exceptions import RequestException
# Upper bound, in characters, on each piece of text handed to the translator.
# NOTE(review): the translation backend enforces its own length limit; confirm
# that a chunk of exactly this size is accepted by the installed version.
MAX_CHUNK_SIZE = 5000
# Total number of translation attempts made per chunk (the first try included)
# before the original text is kept instead.
RETRY_COUNT = 3
# Directory tree that is scanned for files to translate.
source_path = r'S:\temp\vk\English'
# Directory that receives the translated output files.
translate_path = r'S:\temp\vk\cn'
def split_text(text, max_chunk_size):
    """Split text into chunks of at most ``max_chunk_size`` characters.

    Chunks are cut at the last space that fits so words stay intact; when no
    usable space exists the text is cut hard at ``max_chunk_size``. Whitespace
    at the start of the remaining text is dropped after every cut.

    Args:
        text: The string to split.
        max_chunk_size: Maximum length of each chunk; must be positive.

    Returns:
        A list of chunks. Empty input yields ``['']``; otherwise no chunk is
        empty.

    Raises:
        ValueError: If ``max_chunk_size`` is not positive (the loop below
            could never make progress).
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive integer")

    chunks = []
    while len(text) > max_chunk_size:
        # Find the split point that does not cut off words.
        split_point = text.rfind(' ', 0, max_chunk_size)
        # -1 means no space at all; 0 means the only space is the very first
        # character, which would produce an empty chunk. Cut hard in both cases.
        if split_point <= 0:
            split_point = max_chunk_size
        chunks.append(text[:split_point])
        text = text[split_point:].lstrip()

    # Keep the remainder, but do not emit a trailing empty chunk when the
    # leftover was only whitespace. Empty input still yields [''].
    if text or not chunks:
        chunks.append(text)
    return chunks
def translate_text(text, dest_language='zh-CN'):
    """Translate text with GoogleTranslator, chunk by chunk.

    The text is split with ``split_text`` into pieces of at most
    ``MAX_CHUNK_SIZE`` characters. Each piece is attempted up to
    ``RETRY_COUNT`` times when a ``RequestException`` is raised; if every
    attempt fails, the untranslated piece is kept in the output.

    Args:
        text: The text to translate.
        dest_language: Target language code passed to the translator.

    Returns:
        The translated pieces concatenated in order. Empty or
        whitespace-only input is returned unchanged without contacting the
        translator.
    """
    # Blank input has nothing to translate; skip the network round-trip.
    if not text or not text.strip():
        return text

    translator = GoogleTranslator(source='auto', target=dest_language)
    translated_parts = []
    for chunk in split_text(text, MAX_CHUNK_SIZE):
        if not chunk.strip():
            # Nothing translatable in this piece; keep it as-is.
            translated_parts.append(chunk)
            continue
        for attempt in range(1, RETRY_COUNT + 1):
            try:
                translation = translator.translate(chunk)
            except RequestException as e:
                print(f"翻译错误: {e}. 正在尝试重新连接...({attempt}/{RETRY_COUNT})")
                # Only wait when another attempt will actually follow.
                if attempt < RETRY_COUNT:
                    time.sleep(2)
            else:
                # Fall back to the source chunk if the translator hands back
                # a non-string, so the final join cannot fail.
                translated_parts.append(
                    translation if isinstance(translation, str) else chunk
                )
                break
        else:
            # All attempts raised: keep the original text for this chunk.
            print("翻译失败,返回原文本")
            translated_parts.append(chunk)
    return ''.join(translated_parts)
def check_pdf_validity(file_path):
    """Report whether ``file_path`` can be opened as a PDF.

    The file is opened and closed again straight away. Any exception raised
    while doing so is printed and treated as "not valid".

    Args:
        file_path: Path of the file to probe.

    Returns:
        True when the file opened and closed without error, False otherwise.
    """
    try:
        # Being able to open the document is the whole test; release the
        # handle right away.
        fitz.open(file_path).close()
        print(f"文件 {file_path} 是有效的PDF。")
    except Exception as err:
        print(f"文件 {file_path} 不是有效的PDF或已损坏: {err}")
        return False
    return True
def process_pdf_file(file_path):
    """Extract the text of a PDF, translate it, and save it as Markdown.

    Files that fail ``check_pdf_validity`` are skipped. Pages without
    extractable text are reported and ignored. The combined text is split on
    blank lines, each paragraph goes through ``translate_text``, and the
    result is written to ``<name>_cn.md`` inside the module-level
    ``translate_path`` directory.

    Any exception raised while processing is printed rather than propagated.

    Args:
        file_path: Path of the PDF file to translate.
    """
    if not check_pdf_validity(file_path):
        return  # Skip this file if it is not a valid PDF
    try:
        pdf_document = fitz.open(file_path)
        try:
            page_texts = []
            for page_num in range(len(pdf_document)):
                text = pdf_document.load_page(page_num).get_text()
                if not text.strip():
                    # The page is empty or contains only non-text content.
                    print(f"页面 {page_num} 中没有提取到文本")
                    continue
                page_texts.append(text)
        finally:
            # Release the document even if extraction fails part-way.
            pdf_document.close()

        full_text = ''.join(page_texts)
        if not full_text.strip():
            print(f"文件 {file_path} 中没有提取到文本")
            return

        # Show the start of the extracted text for debugging.
        print("\n[翻译前文本]:")
        print(full_text[:2000])

        # Translate paragraph by paragraph (paragraphs are separated by
        # blank lines).
        paragraphs = full_text.split('\n\n')
        translated_content = '\n\n'.join(translate_text(p) for p in paragraphs)

        # Swap only the real extension; a plain str.replace would also
        # rewrite '.pdf' occurring elsewhere in the file name.
        stem, _ = os.path.splitext(os.path.basename(file_path))
        new_file_path = os.path.join(translate_path, stem + '_cn.md')
        with open(new_file_path, 'w', encoding='utf-8') as file:
            file.write(translated_content)
        print(f"文件 {file_path} 处理完成,保存为 {new_file_path}")
    except Exception as e:
        print(f"处理文件 {file_path} 时发生错误: {e}")
def process_md_file(file_path):
    """Translate a Markdown file and save the result as ``<name>_cn.md``.

    The file is read as UTF-8 and split on blank lines; each paragraph goes
    through ``translate_text``. The output is written to the module-level
    ``translate_path`` directory. The first 2000 characters of the text
    before and after translation are printed for debugging.

    Any exception raised while processing is printed rather than propagated.

    Args:
        file_path: Path of the Markdown file to translate.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Show the start of the original content for debugging.
        print("\n[翻译前文本]:")
        print(content[:2000])

        # Translate paragraph by paragraph (paragraphs are separated by
        # blank lines).
        paragraphs = content.split('\n\n')
        translated_content = '\n\n'.join(translate_text(p) for p in paragraphs)

        # Show the start of the translated content for debugging.
        print("\n[翻译后文本]:")
        print(translated_content[:2000])

        # Swap only the real extension; a plain str.replace would also
        # rewrite '.md' occurring elsewhere in the file name.
        stem, _ = os.path.splitext(os.path.basename(file_path))
        new_file_path = os.path.join(translate_path, stem + '_cn.md')
        with open(new_file_path, 'w', encoding='utf-8') as file:
            file.write(translated_content)
        print(f"文件 {file_path} 处理完成,保存为 {new_file_path}")
    except Exception as e:
        print(f"处理文件 {file_path} 时发生错误: {e}")
def process_txt_file(file_path):
    """Translate a plain-text file and save the result as ``<name>_cn.txt``.

    The file is read as UTF-8 and split on blank lines; each paragraph goes
    through ``translate_text``. The output is written to the module-level
    ``translate_path`` directory. The first 2000 characters of the text
    before and after translation are printed for debugging.

    Any exception raised while processing is printed rather than propagated.

    Args:
        file_path: Path of the text file to translate.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Show the start of the original content for debugging.
        print("\n[翻译前文本]:")
        print(content[:2000])

        # Translate paragraph by paragraph (paragraphs are separated by
        # blank lines).
        paragraphs = content.split('\n\n')
        translated_content = '\n\n'.join(translate_text(p) for p in paragraphs)

        # Show the start of the translated content for debugging.
        print("\n[翻译后文本]:")
        print(translated_content[:2000])

        # Swap only the real extension; a plain str.replace would also
        # rewrite '.txt' occurring elsewhere in the file name.
        stem, _ = os.path.splitext(os.path.basename(file_path))
        new_file_path = os.path.join(translate_path, stem + '_cn.txt')
        with open(new_file_path, 'w', encoding='utf-8') as file:
            file.write(translated_content)
        print(f"文件 {file_path} 处理完成,保存为 {new_file_path}")
    except Exception as e:
        print(f"处理文件 {file_path} 时发生错误: {e}")
def translate_files_in_directory(source_path, translate_path):
    """Walk ``source_path`` and translate every supported file found.

    ``translate_path`` is created if it does not exist yet. Files whose names
    end in ``_cn.md``, ``_cn.txt`` or ``_cn.pdf`` are treated as already
    translated and skipped. ``.md``, ``.txt`` and ``.pdf`` files are handed
    to the matching ``process_*`` function; every other file is ignored.

    NOTE: the ``process_*`` functions write their output to the module-level
    ``translate_path`` variable, not to this function's parameter, so the two
    should refer to the same directory.

    Args:
        source_path: Root directory to scan recursively.
        translate_path: Directory that should exist to receive the output.

    Raises:
        FileExistsError: If ``translate_path`` exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(translate_path, exist_ok=True)

    translated_suffixes = ('_cn.md', '_cn.txt', '_cn.pdf')
    for root, _, files in os.walk(source_path):
        for file_name in files:
            file_path = os.path.join(root, file_name)
            # Skip output from an earlier run.
            if file_name.endswith(translated_suffixes):
                print(f"跳过已经翻译的文件: {file_path}")
                continue
            if file_name.endswith('.md'):
                process_md_file(file_path)
            elif file_name.endswith('.txt'):
                process_txt_file(file_path)
            elif file_name.endswith('.pdf'):
                process_pdf_file(file_path)
            # Handlers for further formats can be added here.
# Run the translation over the configured source and output folders.
# NOTE: this call sits at module level, so it also executes on import.
translate_files_in_directory(source_path, translate_path)