```python
from bs4 import BeautifulSoup
import os

def extract_paragraphs_with_links(html_path, output_dir):
    # Make sure the output directory exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Read the HTML file
    with open(html_path, 'r', encoding='utf-8') as file:
        contents = file.read()

    # Parse the HTML with BeautifulSoup (requires the lxml package)
    soup = BeautifulSoup(contents, 'lxml')

    # Collect every <p> tag that contains an <a> tag
    paragraphs = []
    for p in soup.find_all('p'):
        if p.find('a'):  # check whether this <p> contains an <a> tag
            paragraphs.append(p)

    # Write the results to a new file
    output_path = os.path.join(output_dir, 'extracted_paragraphs.txt')
    with open(output_path, 'w', encoding='utf-8') as file:
        for p in paragraphs:
            # Extract only the text inside the <p> tag
            text = p.get_text()
            file.write(text + '\n\n')  # blank line after each paragraph as a separator

# Input file path and output directory
html_path = r'S:\temp\vk\index.html'
output_dir = r'S:\temp\vk'

# Run the function
extract_paragraphs_with_links(html_path, output_dir)
```
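If you also need the link targets rather than just the paragraph text, a minimal variant along the same lines might look like the sketch below. The output filename `extracted_paragraphs_with_hrefs.txt` is an illustrative choice, and `html.parser` is used as the standard-library fallback in case `lxml` is not installed.

```python
from bs4 import BeautifulSoup
import os

def extract_paragraphs_with_hrefs(html_path, output_dir):
    # Same directory check as above, in one call
    os.makedirs(output_dir, exist_ok=True)

    with open(html_path, 'r', encoding='utf-8') as file:
        # html.parser is the stdlib fallback; 'lxml' works too if installed
        soup = BeautifulSoup(file.read(), 'html.parser')

    # Hypothetical output filename, chosen for this sketch
    output_path = os.path.join(output_dir, 'extracted_paragraphs_with_hrefs.txt')
    with open(output_path, 'w', encoding='utf-8') as file:
        for p in soup.find_all('p'):
            # Only consider <a> tags that actually carry an href attribute
            links = p.find_all('a', href=True)
            if not links:
                continue
            file.write(p.get_text(strip=True) + '\n')
            for a in links:
                # Record each link target under its paragraph
                file.write('  -> ' + a['href'] + '\n')
            file.write('\n')
```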