import glob
import os
import re

from bs4 import BeautifulSoup

# (Fragment: the enclosing function's header is not shown. It builds a list of
# [chapter_title, link, link, ...] entries from the parsed chapter results.)
links = []
for i, chapter in enumerate(results, 1):
    link = [chapter['chapter_title']]
    for j, sub in enumerate(chapter['subsections'], 1):
        link.append(sub['link'])
    links.append(link)
return links
# Convert HTML source to Markdown
def html_to_markdown(html):
    soup = BeautifulSoup(html, 'html.parser')

    # Remove unwanted elements
    for tag in soup.find_all(['script', 'style']):
        tag.decompose()
    for msgopt in soup.find_all('div', class_='mailOptions'):
        msgopt.replace_with(f"**{msgopt.get_text(strip=True)}**\n")
    for tag in soup.find_all('div', {'style': 'display:none'}):
        tag.decompose()

    # Convert special tags: ruby annotations become "base(reading)"
    for ruby in soup.find_all('ruby'):
        rb = ruby.find('rb').get_text(strip=True) if ruby.find('rb') else ''
        rt = ruby.find('rt').get_text(strip=True) if ruby.find('rt') else ''
        ruby.replace_with(f"{rb}({rt})" if rt else rb)
    # Handle fold (collapsible) boxes
    for fold in soup.find_all(class_='foldFrame'):
        title = fold.find(class_='foldTitle').get_text(strip=True) if fold.find(class_='foldTitle') else ''
        content = fold.find(class_='foldContent')
        if content:
            content_str = content.get_text('\n', strip=True)
            fold.replace_with(f"**{title}**\n{content_str}\n")
    # Handle plot (story branch) options
    for plot in soup.find_all(class_='plotFrame'):
        options = [opt.get_text(strip=True) for opt in plot.find_all(class_='plotOptions')]
        # contents = [cont.get_text('\n', strip=True) for cont in plot.find_all(class_='content')]
        contents = [cont.get_text(strip=True) for cont in plot.find_all(class_='content')]
        result = "\n".join([f"- {opt}" for opt in options] + contents)
        plot.replace_with(result)
    # for img in soup.find_all('img'):
    #     text = img.get_text(strip=True)
    #     if img.get('alt'):
    #         img.replace_with(f"![{img.get('alt')}]({img.get('src', '')})")
    # for a in soup.find_all('a'):
    #     text = a.get_text(strip=True)
    #     href = a.get('href', '')
    #     a.replace_with(f"[{text}]({href})" if href else text)
    # Handle headings and lists
    for h2 in soup.find_all('h2'):
        h2.replace_with(f"## {h2.get_text(strip=True)}\n")
    for h3 in soup.find_all('h3'):
        h3.replace_with(f"**{h3.get_text(strip=True)}**\n")
    for ul in soup.find_all('ul'):
        for li in ul.find_all('li'):
            li.string = li.get_text(strip=True)
        if ul.parent is not None:
            ul.unwrap()
    # Handle blockquotes
    for blockquote in soup.find_all('blockquote'):
        text = blockquote.get_text('\n', strip=True)
        blockquote.replace_with(f"*{text}*\n")
    # Handle definition lists
    for dl in soup.find_all('dl'):
        items = [f"- {dd.get_text(strip=True)}" for dd in dl.find_all('dd')]
        dl.replace_with("\n".join(items) + "\n")
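    # The excerpt ends here without a return statement. A minimal way to get
    # the resulting Markdown text (assumption; the original return is not shown):
    #     return soup.get_text('\n', strip=True)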
def merge_txt_files(root_path, output_file="XT1.txt"):
    """Merge the .txt files found under root_path into one file."""
    # Collect and sort subfolders (those whose names start with a digit)
    folders = []
    for entry in os.scandir(root_path):
        if entry.is_dir() and re.match(r'^\d', entry.name):
            folders.append(entry.path)
    folders.sort(key=lambda x: natural_sort_key(os.path.basename(x)))
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for folder in folders:
            # Write a level-1 heading (the folder name)
            folder_name = os.path.basename(folder)
            clean_folder = clean_title(folder_name)
            outfile.write(f"# {clean_folder}\n\n")
            # Collect and sort the .txt files (those whose names start with a digit)
            txt_files = glob.glob(os.path.join(folder, "*.txt"))
            txt_files = [f for f in txt_files if re.match(r'^\d', os.path.basename(f))]
            txt_files.sort(key=lambda x: natural_sort_key(os.path.basename(x)))
            for txt_file in txt_files:
                # Write a level-2 heading (the file name without extension)
                file_name = os.path.splitext(os.path.basename(txt_file))[0]
                clean_file = clean_title(file_name)
                outfile.write(f"## {clean_file}\n\n")
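# natural_sort_key and clean_title are called above but not defined in this
# excerpt. Minimal sketches under assumed behavior (the originals may differ):
def natural_sort_key(name):
    # Split out digit runs so "10.x" sorts after "2.x" (natural order).
    return [int(part) if part.isdigit() else part for part in re.split(r'(\d+)', name)]

def clean_title(name):
    # Assumed: drop the leading "<number>." ordering prefix, e.g. "3.Title" -> "Title".
    return re.sub(r'^\d+\.', '', name).strip()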
# (Fragment: `soup` and `md_output` come from the enclosing function, not shown.)
elements = soup.find_all(['h3', 'ul', 'p', 'div', 'dl', 'span'])
skip = 0
for element in elements:
    # Handle headings
    if element.name == 'h3':
        title = element.get_text(strip=True)
        md_output.append(f"***{title}***")
    elif element.name == 'div' and 'foldContent' in element.get('class', []):
        for delement in element.children:
            if delement.name == 'p':
                # Paragraph inside the fold content
                text = delement.get_text(strip=True)
                md_output.append(text)
                skip = 1
            elif delement.name is None and delement.strip():
                # Bare text node
                text = delement.strip()
                md_output.append(text)
    elif element.name == 'p':
        if skip == 1:
            skip = 0
            continue
        # Handle paragraphs that contain <br> line breaks
        content = []
        for child in element.children:
            if child.name == 'br':
                # Convert <br> to a real newline
                content.append('\n')
            elif child.name is None:
                # Text node
                content.append(child.string.strip())
            elif child.name == 'span':
                # Inline span
                content.append(child.get_text(strip=True))
        # Join the pieces and split into lines
        full_text = ''.join(content)
        lines = [line for line in full_text.split('\n') if line.strip()]
        for line in lines:
            # Descriptive text: a line with neither an ASCII nor a fullwidth colon
            if ':' not in line and '：' not in line:
                md_output.append(f"*{line}*")
            else:
                md_output.append(f"-{line}")
    # Handle list items
    elif element.name == 'ul':
        for li in element.find_all('li'):
            text = li.get_text(strip=True)
            md_output.append(text)
    # Handle gray descriptive text
    # (Note: this branch is shadowed by the earlier `element.name == 'p'`
    # branch and is unreachable as written.)
    elif element.name == 'p':
        if element.find('font', color='gray'):
            text = element.get_text(strip=True)
            md_output.append(f"*{text}*\n")
    # Handle plot options
    elif element.name == 'div' and 'plotFrame' in element.get('class', []):
        for option in element.find_all('div', class_='plotOptions'):
            if 'plotActive' in option.get('class', []):
                # Strip the double non-breaking space used for indentation
                text = option.get_text(strip=True).replace('\u00a0\u00a0', '')
                md_output.append(f"> {text}\n")
    # Handle mw-headline section titles
    # elif element.name == 'span' and 'mw-headline' in element.get('class', []):
    #     title = element.get_text(strip=True)
    #     md_output.append(f"**{title}**")
    # Handle descriptive text (dl/dd)
    elif element.name == 'dl':
        if span := element.find('span'):
            text = span.get_text(strip=True)
            md_output.append(f"*{text}*\n")
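# The walker above fills md_output, but the excerpt does not show how the list
# is turned into text. A minimal sketch (assumption):
#     markdown_text = '\n'.join(md_output) + '\n'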
for div in soup.find_all('div', class_='tishi'):
    # The literal means "for the detailed quest text, see the page ..."
    if "详细任务内容,请查阅页面" in div.get_text():
        # Find the first <a> tag inside the div
        a_tag = div.find('a')
        if a_tag and a_tag.has_attr('href'):
            links.append('https://wiki.biligame.com' + a_tag['href'])
print("---------------")
print(links)
# Fetch the content of every notice link
all_contents = ""
for link in notice_links:
    print(f"Fetching notice link: {link}")
    link_content = fetch_html(link)
    if link_content:
        story = gen_story(link_content)
        all_contents += story
with open(str(i) + "." + name + ".txt", "w", encoding="utf-8") as f:
    f.write(all_contents)
return
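# fetch_html and gen_story are called above but not defined in this excerpt
# (gen_story presumably wraps the element walker shown earlier). A minimal
# fetch_html sketch (assumption: a plain GET is enough; the original may add
# headers, retries, or rate limiting):
import requests

def fetch_html(url):
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    resp.encoding = 'utf-8'  # the wiki pages are UTF-8
    return resp.text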
last_content = None  # previous non-empty line, used for de-duplication below
with open(output_filename, 'w', encoding='utf-8') as outfile:
    for filename in file_group:
        # Extract the name from "<number>.<name>.txt"
        name = re.search(r'^\d+\.(.+)\.txt$', filename).group(1)
        # Write the name heading
        outfile.write(f"# {name}\n")
        # Process the current file's contents
        with open(filename, 'r', encoding='utf-8') as infile:
            for line in infile:
                line = line.rstrip('\n')  # strip the trailing newline
                # Preserve blank lines but exclude them from duplicate checks
                if line.strip() == '':
                    outfile.write('\n')
                    continue
                # De-duplicate: skip the line if it matches the previous non-empty line
                if last_content is not None and line == last_content:
                    continue
                # Write the new line and remember it
                outfile.write(line + '\n')
                last_content = line
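# output_filename and file_group come from the enclosing function, which is
# not shown. Example wiring (illustrative values only):
#     file_group = sorted(glob.glob('[0-9]*.txt'), key=natural_sort_key)
#     output_filename = 'XT1.txt'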