提取p标签和文本

from bs4 import BeautifulSoup
import os


def extract_paragraphs_with_links(html_path, output_dir):
    """Extract every <p> tag that contains an <a> tag from an HTML file.

    The plain text of each matching paragraph is written to
    'extracted_paragraphs.txt' inside output_dir, with a blank line
    between paragraphs.

    Args:
        html_path: Path of the input HTML file (read as UTF-8).
        output_dir: Directory for the result file; created if missing.
    """
    # Make sure the output directory exists.
    os.makedirs(output_dir, exist_ok=True)

    # Read the HTML file.
    with open(html_path, 'r', encoding='utf-8') as file:
        contents = file.read()

    # Parse the HTML with BeautifulSoup (lxml backend).
    soup = BeautifulSoup(contents, 'lxml')

    # Keep only the <p> tags that contain at least one <a> tag.
    paragraphs = [p for p in soup.find_all('p') if p.find('a')]

    # Write the extracted text to the output file.
    output_path = os.path.join(output_dir, 'extracted_paragraphs.txt')
    with open(output_path, 'w', encoding='utf-8') as file:
        for p in paragraphs:
            # get_text() flattens the paragraph (link text included) to plain text.
            file.write(p.get_text() + '\n\n')  # blank line separates paragraphs


# Input file path and output directory.
html_path = r'S:\temp\vk\index.html'
output_dir = r'S:\temp\vk'

# Run the extraction.
extract_paragraphs_with_links(html_path, output_dir)


from bs4 import BeautifulSoup
import os


def extract_paragraphs_with_links(html_path, output_dir):
    """Extract every <p> tag that contains an <a> tag from an HTML file.

    The plain text of each matching paragraph is written to
    'extracted_paragraphs.txt' inside output_dir, with a blank line
    between paragraphs.

    Args:
        html_path: Path of the input HTML file (read as UTF-8).
        output_dir: Directory for the result file; created if missing.
    """
    # Make sure the output directory exists.
    os.makedirs(output_dir, exist_ok=True)

    # Read the HTML file.
    with open(html_path, 'r', encoding='utf-8') as file:
        contents = file.read()

    # Parse the HTML with BeautifulSoup (lxml backend).
    soup = BeautifulSoup(contents, 'lxml')

    # Keep only the <p> tags that contain at least one <a> tag.
    paragraphs = [p for p in soup.find_all('p') if p.find('a')]

    # Write the extracted text to the output file.
    output_path = os.path.join(output_dir, 'extracted_paragraphs.txt')
    with open(output_path, 'w', encoding='utf-8') as file:
        for p in paragraphs:
            # get_text() flattens the paragraph (link text included) to plain text.
            file.write(p.get_text() + '\n\n')  # blank line separates paragraphs


# Input file path and output directory.
html_path = r'S:\temp\vk\index.html'
output_dir = r'S:\temp\vk'

# Run the extraction.
extract_paragraphs_with_links(html_path, output_dir)

留下评论

您的邮箱地址不会被公开。 必填项已用 * 标注