Extracting Text from Web Pages with Python

Introduction

A Python script for scraping the story text of 崩坏:星穹铁道 (Honkai: Star Rail). The general approach:

1. Collect the sub-links of each chapter;

2. Fetch the HTML source of each sub-link;

3. Process the HTML: strip the useless parts and convert each HTML tag into a Markdown marker (e.g. dialogue options become '>' quote lines, which display nicely on a blog);

4. Merge the chapters, generate a table of contents, and run a final cleanup pass.

The core of this project is the text processing: special handling for particular tags, keeping the heading scope under control (done carelessly, whole passages end up being treated as headings), cleaning up extra blank lines, and so on. Overall it was actually easier than the earlier music-scraping project.
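
The conversions themselves boil down to a few BeautifulSoup replace_with calls plus one regex for blank lines. Below is a minimal sketch of that idea; the tags and the HTML snippet are made up purely for illustration, and the full scripts later in the post handle many more cases:

import re
from bs4 import BeautifulSoup

sample_html = "<h2>Chapter</h2><blockquote>quoted line</blockquote><p>body</p>"
soup = BeautifulSoup(sample_html, "html.parser")

# Turn <h2> into a "## " heading and <blockquote> into a "> " quote line
for h2 in soup.find_all("h2"):
    h2.replace_with(f"## {h2.get_text(strip=True)}\n")
for quote in soup.find_all("blockquote"):
    quote.replace_with(f"> {quote.get_text(strip=True)}\n")

text = soup.get_text("\n", strip=True)
# Collapse runs of three or more newlines into a single blank line
text = re.sub(r"\n{3,}", "\n\n", text)
print(text)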

Code

1. Fetch the HTML and save it chapter by chapter. This step also does a first cleanup pass and converts the HTML tags into Markdown markers.

import os
import re
import urllib.parse
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException


# Collect the links of every chapter and sub-chapter
def extract_chapters(url):
    # Fetch the overview page
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    results = []

    # Find every chapter container
    chapter_containers = soup.find_all('div', class_='mission-line')

    for container in chapter_containers:
        # Chapter title and link
        chapter_title = container.a.get_text(strip=True)
        chapter_link = container.a['href']

        # The sub-chapter container that follows this chapter
        wrap_content = container.find_next('div', class_='wrap-content')

        subsections = []
        if wrap_content:
            # Collect every sub-chapter
            for subsection in wrap_content.find_all('div', class_='mission-line-child-kt'):
                a_tag = subsection.find('a')
                if a_tag:
                    subsections.append({
                        'title': a_tag.get_text(strip=True),
                        'link': a_tag['href']
                    })

        # Add to the result set
        results.append({
            'chapter_title': chapter_title,
            'chapter_link': chapter_link,
            'subsections': subsections
        })

    # Flatten into [chapter title, sub-link, sub-link, ...]
    links = []
    for chapter in results:
        link = [chapter['chapter_title']]
        for sub in chapter['subsections']:
            link.append(sub['link'])
        links.append(link)

    return links


# Convert the HTML source into Markdown-flavoured text
def html_to_markdown(html):
    soup = BeautifulSoup(html, 'html.parser')
    # Drop elements we never want
    for tag in soup.find_all(['script', 'style']):
        tag.decompose()

    # Special handling for in-game text-message dialogues
    for msg in soup.find_all('div', class_=['MessageToMe', 'MessageFromMe']):
        # Sender name
        sender = msg.find('div', class_='SenderName')
        sender_name = sender.get_text(strip=True) if sender else "未知发送者"
        # Message body
        content = ""
        if msg.find('div', class_='MessageLeft'):
            content = msg.find('div', class_='MessageLeft').get_text(strip=True)
        elif msg.find('div', class_='MessageRight'):
            content = msg.find('div', class_='MessageRight').get_text(strip=True)
        msg.replace_with(f"{sender_name}: {content}")

    # Mail options become bold lines
    for msgopt in soup.find_all('div', class_='mailOptions'):
        msgopt.replace_with(f"**{msgopt.get_text(strip=True)}**\n")

    # Hidden elements
    for tag in soup.find_all('div', {'style': 'display:none'}):
        tag.decompose()

    # <ruby> annotations become "base(reading)"
    for ruby in soup.find_all('ruby'):
        rb = ruby.find('rb').get_text(strip=True) if ruby.find('rb') else ''
        rt = ruby.find('rt').get_text(strip=True) if ruby.find('rt') else ''
        ruby.replace_with(f"{rb}({rt})" if rt else rb)

    # Fold boxes: bold title followed by the folded content
    for fold in soup.find_all(class_='foldFrame'):
        title = fold.find(class_='foldTitle').get_text(strip=True) if fold.find(class_='foldTitle') else ''
        content = fold.find(class_='foldContent')
        if content:
            content_str = content.get_text('\n', strip=True)
            fold.replace_with(f"**{title}**\n{content_str}\n")

    # Dialogue options and their follow-up content
    for plot in soup.find_all(class_='plotFrame'):
        options = [opt.get_text(strip=True) for opt in plot.find_all(class_='plotOptions')]
        # contents = [cont.get_text('\n', strip=True) for cont in plot.find_all(class_='content')]
        contents = [cont.get_text(strip=True) for cont in plot.find_all(class_='content')]
        result = "\n".join([f"- {opt}" for opt in options] + contents)
        plot.replace_with(result)

    # (disabled experiments with converting <img> and <a> tags)
    # for img in soup.find_all('img'):
    #     text = img.get_text(strip=True)
    #     if img.get('alt'):
    #         img.replace_with(f"![{text}]({'https://wiki.biligame.com'+'alt'})")

    # for a in soup.find_all('a'):
    #     text = a.get_text(strip=True)
    #     href = a.get('href', '')
    #     a.replace_with(f"![{text}]({'https://wiki.biligame.com'+href})" if href else text)

    # Headings and lists
    for h2 in soup.find_all('h2'):
        h2.replace_with(f"## {h2.get_text(strip=True)}\n")

    for h3 in soup.find_all('h3'):
        h3.replace_with(f"**{h3.get_text(strip=True)}**\n")

    for ul in soup.find_all('ul'):
        for li in ul.find_all('li'):
            li.string = li.get_text(strip=True)
        if ul.parent is not None:
            ul.unwrap()

    # Block quotes become italics
    for blockquote in soup.find_all('blockquote'):
        text = blockquote.get_text('\n', strip=True)
        blockquote.replace_with(f"*{text}*\n")

    # Definition lists become bullet lists
    for dl in soup.find_all('dl'):
        items = [f"- {dd.get_text(strip=True)}" for dd in dl.find_all('dd')]
        dl.replace_with("\n".join(items) + "\n")

    # Extract the final text and clean it up
    text = soup.get_text('\n', strip=True)
    text = re.sub(r'\n{3,}', '\n\n', text)  # collapse extra blank lines

    # Keep only the story body: from "## 剧情内容" up to the "取自“" footer
    start_index = text.find("## 剧情内容")
    if start_index != -1:
        text = text[start_index:]
    end_index = text.find("取自“")
    if end_index != -1:
        text = text[:end_index]
    return text.strip()


# Create one folder per chapter and one file per sub-chapter
def create_folders_and_files(data_list, base_url):
    folder_count = 1
    for sublist in data_list:
        if not sublist:  # skip empty sublists
            continue

        folder_name = str(folder_count) + sublist[0]  # numbered Chinese folder name
        folder_count += 1
        # Create the folder if it does not exist yet
        os.makedirs(folder_name, exist_ok=True)

        chapter_count = 1
        for url_path in sublist[1:]:
            # Only keep links under /sr/
            if "/sr/" not in url_path:
                print(f"跳过无效链接: {url_path}")
                continue

            encoded_part = url_path.split("/sr/", 1)[1]
            # URL-decode to get the Chinese file name
            file_name = str(chapter_count) + urllib.parse.unquote(encoded_part) + ".txt"
            file_path = os.path.join(folder_name, file_name)

            # Build the full URL
            full_url = base_url + url_path

            # Download and convert the page
            try:
                response = requests.get(full_url)
                text = html_to_markdown(response.text)

                # Write the result to a file
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(text)
                print(f"已创建文件: {file_path}")

            except RequestException as e:
                print(f"下载失败 {full_url}: {str(e)}")
            except Exception as e:
                print(f"处理错误: {str(e)}")
            chapter_count += 1


# Example usage
if __name__ == "__main__":
    print("start")
    BASE_URL = "https://wiki.biligame.com"
    url = "https://wiki.biligame.com/sr/%E5%BC%80%E6%8B%93%E4%BB%BB%E5%8A%A1"
    create_folders_and_files(extract_chapters(url), BASE_URL)
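
Before pointing the script at the whole wiki, it helps to sanity-check html_to_markdown on a tiny handcrafted snippet. The fragment below is made up, and it assumes the function above is defined in the same file or session:

# A made-up page fragment with a heading, one dialogue option and the footer marker
sample = (
    '<h2>剧情内容</h2>'
    '<div class="plotFrame">'
    '<div class="plotOptions">选项A</div>'
    '<div class="content">回答A后的对话内容</div>'
    '</div>'
    '<p>取自“测试页面”</p>'
)
print(html_to_markdown(sample))
# Expected output:
# ## 剧情内容
# - 选项A
# 回答A后的对话内容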

2. Merge all chapters into a single file and strip the remaining redundant content.

import os
import re
import glob


def natural_sort_key(s):
    """Build a natural-sort key so that numeric prefixes sort as numbers."""
    return [int(text) if text.isdigit() else text.lower() for text in re.split(r'(\d+)', s)]


def clean_title(name):
    """Strip the leading number and separator from a name and return the clean title."""
    # Match the leading digits plus any separator (dot, space, underscore, dash)
    match = re.match(r'^\d+[.\s_\-]*', name)
    if match:
        return name[match.end():].strip()
    return name.strip()


def merge_txt_files(root_path, output_file="XT1.txt"):
    """Merge the txt files of all chapter folders into a single file."""
    # Collect the folders whose names start with a digit and sort them naturally
    folders = []
    for entry in os.scandir(root_path):
        if entry.is_dir() and re.match(r'^\d', entry.name):
            folders.append(entry.path)
    folders.sort(key=lambda x: natural_sort_key(os.path.basename(x)))

    with open(output_file, 'w', encoding='utf-8') as outfile:
        for folder in folders:
            # Level-1 heading: the folder (chapter) name
            folder_name = os.path.basename(folder)
            clean_folder = clean_title(folder_name)
            outfile.write(f"# {clean_folder}\n\n")

            # Collect and naturally sort the txt files whose names start with a digit
            txt_files = glob.glob(os.path.join(folder, "*.txt"))
            txt_files = [f for f in txt_files if re.match(r'^\d', os.path.basename(f))]
            txt_files.sort(key=lambda x: natural_sort_key(os.path.basename(x)))

            for txt_file in txt_files:
                # Level-2 heading: the file (sub-chapter) name
                file_name = os.path.splitext(os.path.basename(txt_file))[0]
                clean_file = clean_title(file_name)
                outfile.write(f"## {clean_file}\n\n")

                # Write the cleaned file content
                with open(txt_file, 'r', encoding='utf-8') as infile:
                    content = infile.read().strip()
                    outfile.write(process_text(content) + "\n\n")


def process_text(text):
    # Remove dashes that ended up alone on a line
    lines = []
    for line in text.splitlines():
        if line.strip() == '-':  # a line that is only a dash
            line = line.replace('-', '', 1)  # drop the first dash only
        lines.append(line)
    # Re-join the text
    processed = '\n'.join(lines)
    # Drop the "## 剧情内容" heading
    processed = processed.replace('## 剧情内容', '')
    # Demote "## 注释" to plain "注释"
    processed = processed.replace('## 注释', '注释')
    return processed


if __name__ == "__main__":
    # Example usage: point base_path at the folder that holds the chapter folders
    base_path = "C:/Users/lgf/PycharmProjects/PythonProject2"
    merge_txt_files(base_path)
    print("finish, the txt saved in: " + base_path + "/XT1.txt")
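
A quick check of the two helpers above shows why natural sorting is needed. The file names are illustrative, and the snippet assumes natural_sort_key and clean_title from the script above are in scope:

names = ["10第十章.txt", "2第二章.txt", "1第一章.txt"]

# A plain string sort puts "10..." before "2..."; the natural sort does not
print(sorted(names))                        # ['10第十章.txt', '1第一章.txt', '2第二章.txt']
print(sorted(names, key=natural_sort_key))  # ['1第一章.txt', '2第二章.txt', '10第十章.txt']

print(clean_title("3. 序章-出发"))           # '序章-出发'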

原神 (Genshin Impact)

1. Fetching the page data

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, unquote
import time


# Convert the HTML source into Markdown
def gen_story(html_content):
    # Cut away everything outside the quest-story section
    start_index = html_content.find('<span class="mw-headline" id="任务剧情">任务剧情')
    start_index_2 = html_content.find('<span class="mw-headline"')
    if start_index != -1:
        html_content = html_content[start_index:]
    elif start_index_2 != -1:
        html_content = html_content[start_index_2:]
    end_index = html_content.find('<div class="printfooter">取自“')
    if end_index != -1:
        html_content = html_content[:end_index]

    # Parse what is left
    soup = BeautifulSoup(html_content, 'html.parser')
    md_output = []

    elements = soup.find_all(['h3', 'ul', 'p', 'div', 'dl', 'span'])
    skip = 0
    for element in elements:
        # Headings
        if element.name == 'h3':
            title = element.get_text(strip=True)
            md_output.append(f"***{title}***")

        # Fold boxes: take the paragraphs and bare text inside foldContent
        elif element.name == 'div' and 'foldContent' in element.get('class', []):
            for delement in element.children:
                if delement.name == 'p':
                    # Paragraph inside the fold box
                    md_output.append(delement.get_text(strip=True))
                    skip = 1
                elif delement.name is None and delement.strip():
                    # Bare text node inside the fold box
                    md_output.append(delement.strip())

        # Paragraphs (skip the ones already emitted via foldContent)
        elif element.name == 'p':
            if skip == 1:
                skip = 0
                continue
            # Paragraphs may contain <br> line breaks
            content = []
            for child in element.children:
                if child.name == 'br':
                    # Turn <br> into a real line break
                    content.append('\n')
                elif child.name is None:  # text node
                    content.append(child.string.strip())
                elif child.name == 'span':  # inline span
                    content.append(child.get_text(strip=True))
            # Join and split back into lines
            full_text = ''.join(content)
            lines = [line for line in full_text.split('\n') if line.strip()]
            for line in lines:
                # Lines without a colon are treated as descriptive text
                if ':' not in line and ':' not in line:
                    md_output.append(f"*{line}*")
                else:
                    md_output.append(f"-{line}")

        # List items
        elif element.name == 'ul':
            for li in element.find_all('li'):
                md_output.append(li.get_text(strip=True))

        # Dialogue choices: keep only the option that was actually taken
        elif element.name == 'div' and 'plotFrame' in element.get('class', []):
            for option in element.find_all('div', class_='plotOptions'):
                if 'plotActive' in option.get('class', []):
                    text = option.get_text(strip=True).replace('\u00a0\u00a0', '')  # drop the special spaces
                    md_output.append(f"> {text}\n")

        # Headline spans (disabled)
        # elif element.name == 'span' and 'mw-headline' in element.get('class', []):
        #     turetitle = element.get_text(strip=True)
        #     md_output.append(f"**{turetitle}**")

        # Descriptive text inside definition lists (dd)
        elif element.name == 'dl':
            if span := element.find('span'):
                text = span.get_text(strip=True)
                md_output.append(f"*{text}*\n")

    # Join everything into the final Markdown
    markdown_result = "\n".join(md_output)
    return markdown_result


# Fetch the HTML source of a URL
def fetch_html(url, timeout=10, max_retries=3, delay=1):
    """
    Fetch the HTML of the given URL, with retries and a growing delay.
    """
    retries = 0
    while retries < max_retries:
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response.text
        except (requests.RequestException, requests.Timeout) as e:
            retries += 1
            print(f"请求失败: {e}. 重试 {retries}/{max_retries}...")
            time.sleep(delay * retries)
    print(f"无法获取 {url} 的内容")
    return None


# Collect the per-act links from a chapter page
def process_notice_links(html):
    """
    Handle the "详细任务内容,请查阅页面" notices and return the links they point to.
    """
    soup = BeautifulSoup(html, 'html.parser')
    links = []

    # Look for the notice divs and take the first <a> inside each of them
    for div in soup.find_all('div', class_='tishi'):
        if "详细任务内容,请查阅页面" in div.get_text():
            a_tag = div.find('a')
            if a_tag and a_tag.has_attr('href'):
                links.append('https://wiki.biligame.com' + a_tag['href'])
    print("---------------")
    print(links)

    return links


# Fetch the content of a chapter, following its per-act links if present
def get_full_content(url, i, name):
    """
    Return the full content of the given URL, including the pages referenced by its notice links.
    """
    # Original chapter page
    base_html = fetch_html(url)
    if base_html is None:
        return None

    # Collect the notice links
    notice_links = process_notice_links(base_html)

    # No notice links: the chapter page itself holds the story
    if not notice_links:
        return base_html

    # Otherwise convert and concatenate every linked page, then save it directly
    all_contents = ""
    for link in notice_links:
        print(f"获取提示链接内容: {link}")
        link_content = fetch_html(link)
        if link_content:
            all_contents += gen_story(link_content)
    with open(str(i) + "." + name + ".txt", "w", encoding="utf-8") as f:
        f.write(all_contents)
    return None


# Collect the chapter links, then convert and save each chapter
def extract_chapter_links(url):
    # Fetch the overview page
    response = requests.get(url)
    response.raise_for_status()  # make sure the request succeeded

    # Parse the HTML
    soup = BeautifulSoup(response.text, 'html.parser')

    # The extracted chapter links
    chapter_links = []

    # Every chapter sits inside a div with class="taskIcon"
    task_icons = soup.find_all('div', class_='taskIcon')

    for icon in task_icons:
        # Take the <a> tag inside the taskIcon
        link_tag = icon.find('a', href=True)
        if link_tag:
            # Turn the relative URL into an absolute one
            relative_url = link_tag['href']
            absolute_url = urljoin(url, relative_url)
            chapter_links.append(absolute_url)

    for i, chapter_url in enumerate(chapter_links, 1):
        # Single-link debugging:
        # if i != 14: continue
        print(f"\n处理章节 {i}: {chapter_url}")
        encoded_text = chapter_url.rstrip('/').split('/')[-1]
        # URL-decode to get the Chinese chapter name
        name = unquote(encoded_text)
        full_content = get_full_content(chapter_url, i, name)

        if full_content:
            print(f"获取到完整内容,长度: {len(full_content)} 字符")
            story = gen_story(full_content)

            # Save the converted story to a file
            with open(str(i) + "." + name + ".txt", "w", encoding="utf-8") as f:
                f.write(story)

    return chapter_links


# Example usage
if __name__ == "__main__":
    target_url = "https://wiki.biligame.com/ys/魔神任务"  # replace with the actual page
    links = extract_chapter_links(target_url)
    # print(get_full_content("https://wiki.biligame.com/ys/%E9%BB%8E%E6%98%8E"))
    # gen_story(contant)
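
The colon heuristic in gen_story is what separates spoken lines from stage directions. A tiny illustrative check (the HTML is made up, and it assumes gen_story from the script above is in scope):

sample = (
    '<span class="mw-headline" id="任务剧情">任务剧情</span>'
    '<p>派蒙:我们出发吧<br>一阵风吹过</p>'
    '<div class="printfooter">取自“测试页面”</div>'
)
print(gen_story(sample))
# The line with a colon comes out as "-派蒙:我们出发吧",
# the line without one comes out italicised as "*一阵风吹过*".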

2. Merging and cleaning the text

import os
import re

# How many source files go into each merged file
FILES_PER_OUTPUT = 12


def merge_and_deduplicate_group(file_group, group_index):
    """Merge one group of files and drop consecutive duplicate lines."""
    output_filename = f'原神剧情_part_{group_index + 1}.txt'
    last_content = None  # the previous non-empty line

    with open(output_filename, 'w', encoding='utf-8') as outfile:
        for filename in file_group:
            # Chapter name without the numeric prefix and extension
            name = re.search(r'^\d+\.(.+)\.txt$', filename).group(1)

            # Write the chapter heading
            outfile.write(f"# {name}\n")

            # Copy the file content
            with open(filename, 'r', encoding='utf-8') as infile:
                for line in infile:
                    line = line.rstrip('\n')  # strip the trailing newline

                    # Keep blank lines but exclude them from the duplicate check
                    if line.strip() == '':
                        outfile.write('\n')
                        continue

                    # Skip the line if it repeats the previous non-empty line
                    if last_content is not None and line == last_content:
                        continue

                    # Write the line and remember it
                    outfile.write(line + '\n')
                    last_content = line

            # Blank lines between files
            outfile.write('\n\n')
            last_content = None  # reset for the next file

    return output_filename


def main():
    print("开始为分割合并文件")

    # Collect every "<number>.<name>.txt" file in the current directory
    txt_files = [f for f in os.listdir() if f.endswith('.txt') and re.match(r'\d+\..+\.txt', f)]

    # Sort by the leading number
    txt_files.sort(key=lambda x: int(re.search(r'^(\d+)', x).group(1)))

    # Work out how many groups are needed
    total_files = len(txt_files)
    num_groups = (total_files + FILES_PER_OUTPUT - 1) // FILES_PER_OUTPUT

    print(f"发现 {total_files} 个文件 将分成 {num_groups} 组合并 (每组最多 {FILES_PER_OUTPUT} 个文件)")

    # Split into groups and merge each one
    output_files = []
    for group_index in range(num_groups):
        start_idx = group_index * FILES_PER_OUTPUT
        end_idx = start_idx + FILES_PER_OUTPUT
        file_group = txt_files[start_idx:end_idx]

        print(f"🐾 正在处理第 {group_index + 1} 组: {', '.join(file_group)}")
        output_file = merge_and_deduplicate_group(file_group, group_index)
        output_files.append(output_file)

    # Report the generated files
    print("\n生成以下合并文件:")
    for i, f in enumerate(output_files):
        print(f"  {i + 1}. {f}")


if __name__ == "__main__":
    main()
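
The deduplication rule is simply "skip any non-empty line that repeats the previous non-empty line, but keep blank lines". In isolation the loop looks like this (a standalone sketch with made-up lines, not tied to the files above):

lines = ["旅行者:走吧", "旅行者:走吧", "", "派蒙:好!", "派蒙:好!"]

deduped = []
last_content = None  # the previous non-empty line
for line in lines:
    if line.strip() == "":
        deduped.append(line)      # keep blank lines and skip the comparison
        continue
    if line == last_content:      # repeats the previous non-empty line: drop it
        continue
    deduped.append(line)
    last_content = line

print(deduped)  # ['旅行者:走吧', '', '派蒙:好!']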

Results

A few stray thoughts

Most of these Python scripts were generated with DeepSeek, and it is hard not to marvel at how fast AI has developed.

ChatGPT burst onto the scene during my sophomore year (around 2022?). Back then I treated it as nothing more than an advanced chatbot (the hottest thing at the time was a certain catgirl prompt project on GitHub), hooked its API into my QQ bot just for fun, and "artificial stupidity" was still everyone's default joke.

Not long after, I realised it was far smarter than I had assumed. Its grasp of user instructions put the likes of Microsoft XiaoIce (微软小冰) and 聊天女仆萌萌 in another league entirely, and it could even handle university homework and algorithm problems (though the accuracy left much to be desired).

By the summer of 2023 I had my first proper job, building web pages with Vue + Element UI as a front-end developer, and the enormous convenience ChatGPT brought meant that even a newcomer to the industry could take on development tasks independently.

Today, DeepSeek-R1 can take a fairly rough description and produce a script that runs perfectly in one pass. With frameworks on top it can power far more complex, in-depth applications, and AI painting, video, and writing are flourishing to the point where it is hard to tell human work from machine output.

By the time we noticed AI weaving itself into our lives, it was already everywhere.

No one knows what the future holds; we will all be there to see it.