import os
import re
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
# 获取用户输入
page_name = input("请输入维基百科页面名称(例如 'Botswana'):")
# 构建 URL 和文件路径
url = f'https://en.wikipedia.org/wiki/{page_name}'
file_name = f'{page_name}.md'
file_path = os.path.join("S:\\temp\\vk", file_name)
# 自定义请求头(如果需要)
custom_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
# 发送 HTTP GET 请求,并包含自定义请求头
response = requests.get(url, headers=custom_headers)
# 创建 BeautifulSoup 对象
soup = BeautifulSoup(response.text, 'html.parser')
# 正则表达式模式,用于匹配类似 '[1]' 的标记
pattern = r'\[\d+\]'
# 定位到 #bodyContent 元素
body_content = soup.select_one('#bodyContent')
# 创建一个列表,用于存储按顺序提取的文本
extracted_texts = []
# 如果找到 #bodyContent 元素
if body_content:
# 遍历 body_content 的所有子元素,保持顺序
for element in body_content.descendants:
if element.name == 'h2':
text = element.get_text()
cleaned_text = re.sub(pattern, '', text)
extracted_texts.append(f"## {cleaned_text.strip()}")
elif element.name == 'p':
text = element.get_text()
cleaned_text = re.sub(pattern, '', text)
extracted_texts.append(f"{cleaned_text.strip()}")
elif element.name == 'h3': # 继续提取
text = element.get_text()
cleaned_text = re.sub(pattern, '', text)
extracted_texts.append(f"### {cleaned_text.strip()}")
# 将提取的文本写入文件
with open(file_path, 'w', encoding='utf-8') as file:
for text in extracted_texts:
file.write(text + '\n')
print(f"提取的内容已保存到 {file_path}")
else:
print("未找到 #bodyContent 元素")