```python
from bs4 import BeautifulSoup
import os

def extract_paragraphs_with_links(html_path, output_dir):
    # Make sure the output directory exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Read the HTML file
    with open(html_path, 'r', encoding='utf-8') as file:
        contents = file.read()

    # Parse the HTML with BeautifulSoup (requires the lxml package)
    soup = BeautifulSoup(contents, 'lxml')

    # Collect every <p> tag that contains an <a> tag
    paragraphs = []
    for p in soup.find_all('p'):
        if p.find('a'):  # check whether this <p> contains an <a> tag
            paragraphs.append(p)

    # Write the results to a new file
    output_path = os.path.join(output_dir, 'extracted_paragraphs.txt')
    with open(output_path, 'w', encoding='utf-8') as file:
        for p in paragraphs:
            # Extract only the text inside the <p> tag
            text = p.get_text()
            file.write(text + '\n\n')  # blank line after each paragraph as a separator

# Input file path and output directory
html_path = r'S:\temp\vk\index.html'
output_dir = r'S:\temp\vk'

# Run the function
extract_paragraphs_with_links(html_path, output_dir)
```
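If you also need the link targets rather than just the paragraph text, a minimal variant along the same lines might look like the sketch below. The output filename `extracted_paragraphs_with_hrefs.txt` is an illustrative choice, and `html.parser` is used as the standard-library fallback in case `lxml` is not installed.

```python
from bs4 import BeautifulSoup
import os

def extract_paragraphs_with_hrefs(html_path, output_dir):
    # Same directory check as above, in one call
    os.makedirs(output_dir, exist_ok=True)

    with open(html_path, 'r', encoding='utf-8') as file:
        # html.parser is the stdlib fallback; 'lxml' works too if installed
        soup = BeautifulSoup(file.read(), 'html.parser')

    # Hypothetical output filename, chosen for this sketch
    output_path = os.path.join(output_dir, 'extracted_paragraphs_with_hrefs.txt')
    with open(output_path, 'w', encoding='utf-8') as file:
        for p in soup.find_all('p'):
            # Only consider <a> tags that actually carry an href attribute
            links = p.find_all('a', href=True)
            if not links:
                continue
            file.write(p.get_text(strip=True) + '\n')
            for a in links:
                # Record each link target under its paragraph
                file.write('  -> ' + a['href'] + '\n')
            file.write('\n')
```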