Extracting Text from Web Pages with Python

Introduction

A Python script for scraping the story text of 崩坏:星穹铁道 (Honkai: Star Rail). The general approach:

1. Collect the sub-links of each chapter;

2. Fetch the HTML source of each sub-link;

3. Process the HTML: strip the useless parts and convert each HTML tag into a Markdown marker (e.g. dialogue options become '>' quote lines, which display nicely on a blog);

4. Merge the chapters, generate a table of contents, and run a final cleanup pass.

The core of this project is the text processing: special handling for particular tags, keeping the heading scope under control (done carelessly, whole passages end up being treated as headings), cleaning up extra blank lines, and so on. Overall it was actually easier than the earlier music-scraping project.
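
The conversions themselves boil down to a few BeautifulSoup replace_with calls plus one regex for blank lines. Below is a minimal sketch of that idea; the tags and the HTML snippet are made up purely for illustration, and the full scripts later in the post handle many more cases:

import re
from bs4 import BeautifulSoup

sample_html = "<h2>Chapter</h2><blockquote>quoted line</blockquote><p>body</p>"
soup = BeautifulSoup(sample_html, "html.parser")

# Turn <h2> into a "## " heading and <blockquote> into a "> " quote line
for h2 in soup.find_all("h2"):
    h2.replace_with(f"## {h2.get_text(strip=True)}\n")
for quote in soup.find_all("blockquote"):
    quote.replace_with(f"> {quote.get_text(strip=True)}\n")

text = soup.get_text("\n", strip=True)
# Collapse runs of three or more newlines into a single blank line
text = re.sub(r"\n{3,}", "\n\n", text)
print(text)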

Code

1. Fetch the HTML and save it chapter by chapter. This step also does a first cleanup pass and converts the HTML tags into Markdown markers.

import os
import re
import urllib.parse
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException


# Collect the links of every chapter and sub-chapter
def extract_chapters(url):
    # Fetch the overview page
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    results = []

    # Find every chapter container
    chapter_containers = soup.find_all('div', class_='mission-line')

    for container in chapter_containers:
        # Chapter title and link
        chapter_title = container.a.get_text(strip=True)
        chapter_link = container.a['href']

        # The sub-chapter container that follows this chapter
        wrap_content = container.find_next('div', class_='wrap-content')

        subsections = []
        if wrap_content:
            # Collect every sub-chapter
            for subsection in wrap_content.find_all('div', class_='mission-line-child-kt'):
                a_tag = subsection.find('a')
                if a_tag:
                    subsections.append({
                        'title': a_tag.get_text(strip=True),
                        'link': a_tag['href']
                    })

        # Add to the result set
        results.append({
            'chapter_title': chapter_title,
            'chapter_link': chapter_link,
            'subsections': subsections
        })

    # Flatten into [chapter title, sub-link, sub-link, ...]
    links = []
    for chapter in results:
        link = [chapter['chapter_title']]
        for sub in chapter['subsections']:
            link.append(sub['link'])
        links.append(link)

    return links


# Convert the HTML source into Markdown-flavoured text
def html_to_markdown(html):
    soup = BeautifulSoup(html, 'html.parser')
    # Drop elements we never want
    for tag in soup.find_all(['script', 'style']):
        tag.decompose()

    # Special handling for in-game text-message dialogues
    for msg in soup.find_all('div', class_=['MessageToMe', 'MessageFromMe']):
        # Sender name
        sender = msg.find('div', class_='SenderName')
        sender_name = sender.get_text(strip=True) if sender else "未知发送者"
        # Message body
        content = ""
        if msg.find('div', class_='MessageLeft'):
            content = msg.find('div', class_='MessageLeft').get_text(strip=True)
        elif msg.find('div', class_='MessageRight'):
            content = msg.find('div', class_='MessageRight').get_text(strip=True)
        msg.replace_with(f"{sender_name}: {content}")

    # Mail options become bold lines
    for msgopt in soup.find_all('div', class_='mailOptions'):
        msgopt.replace_with(f"**{msgopt.get_text(strip=True)}**\n")

    # Hidden elements
    for tag in soup.find_all('div', {'style': 'display:none'}):
        tag.decompose()

    # <ruby> annotations become "base(reading)"
    for ruby in soup.find_all('ruby'):
        rb = ruby.find('rb').get_text(strip=True) if ruby.find('rb') else ''
        rt = ruby.find('rt').get_text(strip=True) if ruby.find('rt') else ''
        ruby.replace_with(f"{rb}({rt})" if rt else rb)

    # Fold boxes: bold title followed by the folded content
    for fold in soup.find_all(class_='foldFrame'):
        title = fold.find(class_='foldTitle').get_text(strip=True) if fold.find(class_='foldTitle') else ''
        content = fold.find(class_='foldContent')
        if content:
            content_str = content.get_text('\n', strip=True)
            fold.replace_with(f"**{title}**\n{content_str}\n")

    # Dialogue options and their follow-up content
    for plot in soup.find_all(class_='plotFrame'):
        options = [opt.get_text(strip=True) for opt in plot.find_all(class_='plotOptions')]
        # contents = [cont.get_text('\n', strip=True) for cont in plot.find_all(class_='content')]
        contents = [cont.get_text(strip=True) for cont in plot.find_all(class_='content')]
        result = "\n".join([f"- {opt}" for opt in options] + contents)
        plot.replace_with(result)

    # (disabled experiments with converting <img> and <a> tags)
    # for img in soup.find_all('img'):
    #     text = img.get_text(strip=True)
    #     if img.get('alt'):
    #         img.replace_with(f"![{text}]({'https://wiki.biligame.com'+'alt'})")

    # for a in soup.find_all('a'):
    #     text = a.get_text(strip=True)
    #     href = a.get('href', '')
    #     a.replace_with(f"![{text}]({'https://wiki.biligame.com'+href})" if href else text)

    # Headings and lists
    for h2 in soup.find_all('h2'):
        h2.replace_with(f"## {h2.get_text(strip=True)}\n")

    for h3 in soup.find_all('h3'):
        h3.replace_with(f"**{h3.get_text(strip=True)}**\n")

    for ul in soup.find_all('ul'):
        for li in ul.find_all('li'):
            li.string = li.get_text(strip=True)
        if ul.parent is not None:
            ul.unwrap()

    # Block quotes become italics
    for blockquote in soup.find_all('blockquote'):
        text = blockquote.get_text('\n', strip=True)
        blockquote.replace_with(f"*{text}*\n")

    # Definition lists become bullet lists
    for dl in soup.find_all('dl'):
        items = [f"- {dd.get_text(strip=True)}" for dd in dl.find_all('dd')]
        dl.replace_with("\n".join(items) + "\n")

    # Extract the final text and clean it up
    text = soup.get_text('\n', strip=True)
    text = re.sub(r'\n{3,}', '\n\n', text)  # collapse extra blank lines

    # Keep only the story body: from "## 剧情内容" up to the "取自“" footer
    start_index = text.find("## 剧情内容")
    if start_index != -1:
        text = text[start_index:]
    end_index = text.find("取自“")
    if end_index != -1:
        text = text[:end_index]
    return text.strip()


# Create one folder per chapter and one file per sub-chapter
def create_folders_and_files(data_list, base_url):
    folder_count = 1
    for sublist in data_list:
        if not sublist:  # skip empty sublists
            continue

        folder_name = str(folder_count) + sublist[0]  # numbered Chinese folder name
        folder_count += 1
        # Create the folder if it does not exist yet
        os.makedirs(folder_name, exist_ok=True)

        chapter_count = 1
        for url_path in sublist[1:]:
            # Only keep links under /sr/
            if "/sr/" not in url_path:
                print(f"跳过无效链接: {url_path}")
                continue

            encoded_part = url_path.split("/sr/", 1)[1]
            # URL-decode to get the Chinese file name
            file_name = str(chapter_count) + urllib.parse.unquote(encoded_part) + ".txt"
            file_path = os.path.join(folder_name, file_name)

            # Build the full URL
            full_url = base_url + url_path

            # Download and convert the page
            try:
                response = requests.get(full_url)
                text = html_to_markdown(response.text)

                # Write the result to a file
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(text)
                print(f"已创建文件: {file_path}")

            except RequestException as e:
                print(f"下载失败 {full_url}: {str(e)}")
            except Exception as e:
                print(f"处理错误: {str(e)}")
            chapter_count += 1


# Example usage
if __name__ == "__main__":
    print("start")
    BASE_URL = "https://wiki.biligame.com"
    url = "https://wiki.biligame.com/sr/%E5%BC%80%E6%8B%93%E4%BB%BB%E5%8A%A1"
    create_folders_and_files(extract_chapters(url), BASE_URL)
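
Before pointing the script at the whole wiki, it helps to sanity-check html_to_markdown on a tiny handcrafted snippet. The fragment below is made up, and it assumes the function above is defined in the same file or session:

# A made-up page fragment with a heading, one dialogue option and the footer marker
sample = (
    '<h2>剧情内容</h2>'
    '<div class="plotFrame">'
    '<div class="plotOptions">选项A</div>'
    '<div class="content">回答A后的对话内容</div>'
    '</div>'
    '<p>取自“测试页面”</p>'
)
print(html_to_markdown(sample))
# Expected output:
# ## 剧情内容
# - 选项A
# 回答A后的对话内容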

2. Merge all chapters into a single file and strip the remaining redundant content.

import os
import re
import glob


def natural_sort_key(s):
    """Build a natural-sort key so that numeric prefixes sort as numbers."""
    return [int(text) if text.isdigit() else text.lower() for text in re.split(r'(\d+)', s)]


def clean_title(name):
    """Strip the leading number and separator from a name and return the clean title."""
    # Match the leading digits plus any separator (dot, space, underscore, dash)
    match = re.match(r'^\d+[.\s_\-]*', name)
    if match:
        return name[match.end():].strip()
    return name.strip()


def merge_txt_files(root_path, output_file="XT1.txt"):
    """Merge the txt files of all chapter folders into a single file."""
    # Collect the folders whose names start with a digit and sort them naturally
    folders = []
    for entry in os.scandir(root_path):
        if entry.is_dir() and re.match(r'^\d', entry.name):
            folders.append(entry.path)
    folders.sort(key=lambda x: natural_sort_key(os.path.basename(x)))

    with open(output_file, 'w', encoding='utf-8') as outfile:
        for folder in folders:
            # Level-1 heading: the folder (chapter) name
            folder_name = os.path.basename(folder)
            clean_folder = clean_title(folder_name)
            outfile.write(f"# {clean_folder}\n\n")

            # Collect and naturally sort the txt files whose names start with a digit
            txt_files = glob.glob(os.path.join(folder, "*.txt"))
            txt_files = [f for f in txt_files if re.match(r'^\d', os.path.basename(f))]
            txt_files.sort(key=lambda x: natural_sort_key(os.path.basename(x)))

            for txt_file in txt_files:
                # Level-2 heading: the file (sub-chapter) name
                file_name = os.path.splitext(os.path.basename(txt_file))[0]
                clean_file = clean_title(file_name)
                outfile.write(f"## {clean_file}\n\n")

                # Write the cleaned file content
                with open(txt_file, 'r', encoding='utf-8') as infile:
                    content = infile.read().strip()
                    outfile.write(process_text(content) + "\n\n")


def process_text(text):
    # Remove dashes that ended up alone on a line
    lines = []
    for line in text.splitlines():
        if line.strip() == '-':  # a line that is only a dash
            line = line.replace('-', '', 1)  # drop the first dash only
        lines.append(line)
    # Re-join the text
    processed = '\n'.join(lines)
    # Drop the "## 剧情内容" heading
    processed = processed.replace('## 剧情内容', '')
    # Demote "## 注释" to plain "注释"
    processed = processed.replace('## 注释', '注释')
    return processed


if __name__ == "__main__":
    # Example usage: point base_path at the folder that holds the chapter folders
    base_path = "C:/Users/lgf/PycharmProjects/PythonProject2"
    merge_txt_files(base_path)
    print("finish, the txt saved in: " + base_path + "/XT1.txt")
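
A quick check of the two helpers above shows why natural sorting is needed. The file names are illustrative, and the snippet assumes natural_sort_key and clean_title from the script above are in scope:

names = ["10第十章.txt", "2第二章.txt", "1第一章.txt"]

# A plain string sort puts "10..." before "2..."; the natural sort does not
print(sorted(names))                        # ['10第十章.txt', '1第一章.txt', '2第二章.txt']
print(sorted(names, key=natural_sort_key))  # ['1第一章.txt', '2第二章.txt', '10第十章.txt']

print(clean_title("3. 序章-出发"))           # '序章-出发'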

原神 (Genshin Impact)

1. Fetching the page data

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, unquote
import time


# Convert the HTML source into Markdown
def gen_story(html_content):
    # Cut away everything outside the quest-story section
    start_index = html_content.find('<span class="mw-headline" id="任务剧情">任务剧情')
    start_index_2 = html_content.find('<span class="mw-headline"')
    if start_index != -1:
        html_content = html_content[start_index:]
    elif start_index_2 != -1:
        html_content = html_content[start_index_2:]
    end_index = html_content.find('<div class="printfooter">取自“')
    if end_index != -1:
        html_content = html_content[:end_index]

    # Parse what is left
    soup = BeautifulSoup(html_content, 'html.parser')
    md_output = []

    elements = soup.find_all(['h3', 'ul', 'p', 'div', 'dl', 'span'])
    skip = 0
    for element in elements:
        # Headings
        if element.name == 'h3':
            title = element.get_text(strip=True)
            md_output.append(f"***{title}***")

        # Fold boxes: take the paragraphs and bare text inside foldContent
        elif element.name == 'div' and 'foldContent' in element.get('class', []):
            for delement in element.children:
                if delement.name == 'p':
                    # Paragraph inside the fold box
                    md_output.append(delement.get_text(strip=True))
                    skip = 1
                elif delement.name is None and delement.strip():
                    # Bare text node inside the fold box
                    md_output.append(delement.strip())

        # Paragraphs (skip the ones already emitted via foldContent)
        elif element.name == 'p':
            if skip == 1:
                skip = 0
                continue
            # Paragraphs may contain <br> line breaks
            content = []
            for child in element.children:
                if child.name == 'br':
                    # Turn <br> into a real line break
                    content.append('\n')
                elif child.name is None:  # text node
                    content.append(child.string.strip())
                elif child.name == 'span':  # inline span
                    content.append(child.get_text(strip=True))
            # Join and split back into lines
            full_text = ''.join(content)
            lines = [line for line in full_text.split('\n') if line.strip()]
            for line in lines:
                # Lines without a colon are treated as descriptive text
                if ':' not in line and ':' not in line:
                    md_output.append(f"*{line}*")
                else:
                    md_output.append(f"-{line}")

        # List items
        elif element.name == 'ul':
            for li in element.find_all('li'):
                md_output.append(li.get_text(strip=True))

        # Dialogue choices: keep only the option that was actually taken
        elif element.name == 'div' and 'plotFrame' in element.get('class', []):
            for option in element.find_all('div', class_='plotOptions'):
                if 'plotActive' in option.get('class', []):
                    text = option.get_text(strip=True).replace('\u00a0\u00a0', '')  # drop the special spaces
                    md_output.append(f"> {text}\n")

        # Headline spans (disabled)
        # elif element.name == 'span' and 'mw-headline' in element.get('class', []):
        #     turetitle = element.get_text(strip=True)
        #     md_output.append(f"**{turetitle}**")

        # Descriptive text inside definition lists (dd)
        elif element.name == 'dl':
            if span := element.find('span'):
                text = span.get_text(strip=True)
                md_output.append(f"*{text}*\n")

    # Join everything into the final Markdown
    markdown_result = "\n".join(md_output)
    return markdown_result


# Fetch the HTML source of a URL
def fetch_html(url, timeout=10, max_retries=3, delay=1):
    """
    Fetch the HTML of the given URL, with retries and a growing delay.
    """
    retries = 0
    while retries < max_retries:
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response.text
        except (requests.RequestException, requests.Timeout) as e:
            retries += 1
            print(f"请求失败: {e}. 重试 {retries}/{max_retries}...")
            time.sleep(delay * retries)
    print(f"无法获取 {url} 的内容")
    return None


# Collect the per-act links from a chapter page
def process_notice_links(html):
    """
    Handle the "详细任务内容,请查阅页面" notices and return the links they point to.
    """
    soup = BeautifulSoup(html, 'html.parser')
    links = []

    # Look for the notice divs and take the first <a> inside each of them
    for div in soup.find_all('div', class_='tishi'):
        if "详细任务内容,请查阅页面" in div.get_text():
            a_tag = div.find('a')
            if a_tag and a_tag.has_attr('href'):
                links.append('https://wiki.biligame.com' + a_tag['href'])
    print("---------------")
    print(links)

    return links


# Fetch the content of a chapter, following its per-act links if present
def get_full_content(url, i, name):
    """
    Return the full content of the given URL, including the pages referenced by its notice links.
    """
    # Original chapter page
    base_html = fetch_html(url)
    if base_html is None:
        return None

    # Collect the notice links
    notice_links = process_notice_links(base_html)

    # No notice links: the chapter page itself holds the story
    if not notice_links:
        return base_html

    # Otherwise convert and concatenate every linked page, then save it directly
    all_contents = ""
    for link in notice_links:
        print(f"获取提示链接内容: {link}")
        link_content = fetch_html(link)
        if link_content:
            all_contents += gen_story(link_content)
    with open(str(i) + "." + name + ".txt", "w", encoding="utf-8") as f:
        f.write(all_contents)
    return None


# Collect the chapter links, then convert and save each chapter
def extract_chapter_links(url):
    # Fetch the overview page
    response = requests.get(url)
    response.raise_for_status()  # make sure the request succeeded

    # Parse the HTML
    soup = BeautifulSoup(response.text, 'html.parser')

    # The extracted chapter links
    chapter_links = []

    # Every chapter sits inside a div with class="taskIcon"
    task_icons = soup.find_all('div', class_='taskIcon')

    for icon in task_icons:
        # Take the <a> tag inside the taskIcon
        link_tag = icon.find('a', href=True)
        if link_tag:
            # Turn the relative URL into an absolute one
            relative_url = link_tag['href']
            absolute_url = urljoin(url, relative_url)
            chapter_links.append(absolute_url)

    for i, chapter_url in enumerate(chapter_links, 1):
        # Single-link debugging:
        # if i != 14: continue
        print(f"\n处理章节 {i}: {chapter_url}")
        encoded_text = chapter_url.rstrip('/').split('/')[-1]
        # URL-decode to get the Chinese chapter name
        name = unquote(encoded_text)
        full_content = get_full_content(chapter_url, i, name)

        if full_content:
            print(f"获取到完整内容,长度: {len(full_content)} 字符")
            story = gen_story(full_content)

            # Save the converted story to a file
            with open(str(i) + "." + name + ".txt", "w", encoding="utf-8") as f:
                f.write(story)

    return chapter_links


# Example usage
if __name__ == "__main__":
    target_url = "https://wiki.biligame.com/ys/魔神任务"  # replace with the actual page
    links = extract_chapter_links(target_url)
    # print(get_full_content("https://wiki.biligame.com/ys/%E9%BB%8E%E6%98%8E"))
    # gen_story(contant)
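
The colon heuristic in gen_story is what separates spoken lines from stage directions. A tiny illustrative check (the HTML is made up, and it assumes gen_story from the script above is in scope):

sample = (
    '<span class="mw-headline" id="任务剧情">任务剧情</span>'
    '<p>派蒙:我们出发吧<br>一阵风吹过</p>'
    '<div class="printfooter">取自“测试页面”</div>'
)
print(gen_story(sample))
# The line with a colon comes out as "-派蒙:我们出发吧",
# the line without one comes out italicised as "*一阵风吹过*".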

2. Merging and cleaning the text

import os
import re

# How many source files go into each merged file
FILES_PER_OUTPUT = 12


def merge_and_deduplicate_group(file_group, group_index):
    """Merge one group of files and drop consecutive duplicate lines."""
    output_filename = f'原神剧情_part_{group_index + 1}.txt'
    last_content = None  # the previous non-empty line

    with open(output_filename, 'w', encoding='utf-8') as outfile:
        for filename in file_group:
            # Chapter name without the numeric prefix and extension
            name = re.search(r'^\d+\.(.+)\.txt$', filename).group(1)

            # Write the chapter heading
            outfile.write(f"# {name}\n")

            # Copy the file content
            with open(filename, 'r', encoding='utf-8') as infile:
                for line in infile:
                    line = line.rstrip('\n')  # strip the trailing newline

                    # Keep blank lines but exclude them from the duplicate check
                    if line.strip() == '':
                        outfile.write('\n')
                        continue

                    # Skip the line if it repeats the previous non-empty line
                    if last_content is not None and line == last_content:
                        continue

                    # Write the line and remember it
                    outfile.write(line + '\n')
                    last_content = line

            # Blank lines between files
            outfile.write('\n\n')
            last_content = None  # reset for the next file

    return output_filename


def main():
    print("开始为分割合并文件")

    # Collect every "<number>.<name>.txt" file in the current directory
    txt_files = [f for f in os.listdir() if f.endswith('.txt') and re.match(r'\d+\..+\.txt', f)]

    # Sort by the leading number
    txt_files.sort(key=lambda x: int(re.search(r'^(\d+)', x).group(1)))

    # Work out how many groups are needed
    total_files = len(txt_files)
    num_groups = (total_files + FILES_PER_OUTPUT - 1) // FILES_PER_OUTPUT

    print(f"发现 {total_files} 个文件 将分成 {num_groups} 组合并 (每组最多 {FILES_PER_OUTPUT} 个文件)")

    # Split into groups and merge each one
    output_files = []
    for group_index in range(num_groups):
        start_idx = group_index * FILES_PER_OUTPUT
        end_idx = start_idx + FILES_PER_OUTPUT
        file_group = txt_files[start_idx:end_idx]

        print(f"🐾 正在处理第 {group_index + 1} 组: {', '.join(file_group)}")
        output_file = merge_and_deduplicate_group(file_group, group_index)
        output_files.append(output_file)

    # Report the generated files
    print("\n生成以下合并文件:")
    for i, f in enumerate(output_files):
        print(f"  {i + 1}. {f}")


if __name__ == "__main__":
    main()
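
The deduplication rule is simply "skip any non-empty line that repeats the previous non-empty line, but keep blank lines". In isolation the loop looks like this (a standalone sketch with made-up lines, not tied to the files above):

lines = ["旅行者:走吧", "旅行者:走吧", "", "派蒙:好!", "派蒙:好!"]

deduped = []
last_content = None  # the previous non-empty line
for line in lines:
    if line.strip() == "":
        deduped.append(line)      # keep blank lines and skip the comparison
        continue
    if line == last_content:      # repeats the previous non-empty line: drop it
        continue
    deduped.append(line)
    last_content = line

print(deduped)  # ['旅行者:走吧', '', '派蒙:好!']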

Results

A few stray thoughts

Most of these Python scripts were generated with DeepSeek, and it is hard not to marvel at how fast AI has developed.

ChatGPT burst onto the scene during my sophomore year (around 2022?). Back then I treated it as nothing more than an advanced chatbot (the hottest thing at the time was a certain catgirl prompt project on GitHub), hooked its API into my QQ bot just for fun, and "artificial stupidity" was still everyone's default joke.

Not long after, I realised it was far smarter than I had assumed. Its grasp of user instructions put the likes of Microsoft XiaoIce (微软小冰) and 聊天女仆萌萌 in another league entirely, and it could even handle university homework and algorithm problems (though the accuracy left much to be desired).

By the summer of 2023 I had my first proper job, building web pages with Vue + Element UI as a front-end developer, and the enormous convenience ChatGPT brought meant that even a newcomer to the industry could take on development tasks independently.

Today, DeepSeek-R1 can take a fairly rough description and produce a script that runs perfectly in one pass. With frameworks on top it can power far more complex, in-depth applications, and AI painting, video, and writing are flourishing to the point where it is hard to tell human work from machine output.

By the time we noticed AI weaving itself into our lives, it was already everywhere.

No one knows what the future holds; we will all be there to see it.