# 需求: 用 Python 采集百度热搜词, 自动去重后保存为 txt 文件; 服务器上可配合定时任务脚本持续采集。本版本将第一版脚本用类封装。
import re


class BaiduHotTopicsScraper:
    """Scrape the Baidu realtime hot-search board and persist de-duplicated topics to a txt file.

    The output file holds one topic per line; new topics are merged on top of the
    previously saved ones, preserving order and dropping duplicates.
    """

    # Topic titles on https://top.baidu.com/board?tab=realtime are wrapped in this div.
    # NOTE(review): the original markers were lost ( find('') degenerates into an
    # infinite loop since str.find('') returns the search start) — this pattern is
    # assumed from the current page markup; confirm if Baidu changes its HTML.
    _TOPIC_PATTERN = re.compile(
        r'<div class="c-single-text-ellipsis">\s*(.*?)\s*</div>', re.S
    )

    def __init__(self, url, filename):
        self.url = url            # board page to scrape
        self.filename = filename  # txt file holding one topic per line

    @staticmethod
    def _parse_topics(html):
        """Return topic titles extracted from page HTML, stripped, in page order."""
        return [t.strip() for t in BaiduHotTopicsScraper._TOPIC_PATTERN.findall(html)]

    @staticmethod
    def _merge_topics(new_topics, existing_topics):
        """Merge new topics ahead of existing ones, de-duplicating while keeping order.

        dict.fromkeys preserves first-seen order, unlike the previous set arithmetic
        which produced a nondeterministic ordering.
        """
        return list(dict.fromkeys(list(new_topics) + list(existing_topics)))

    def _load_existing(self):
        """Read previously saved topics; an absent file simply means no history yet."""
        try:
            with open(self.filename, 'r', encoding='utf-8') as f:
                return [line.strip() for line in f if line.strip()]
        except FileNotFoundError:
            return []

    def scrape_and_update_file(self):
        """Fetch the board page, merge its topics with the saved ones, rewrite the file.

        Raises requests.HTTPError on a non-2xx response; file-write problems are
        reported (printed) rather than raised, matching the original behavior.
        """
        # Local import keeps the module importable even without requests installed.
        import requests

        response = requests.get(self.url, timeout=10)
        response.raise_for_status()
        response.encoding = 'utf-8'  # be explicit; avoid mojibake from charset guessing
        hot_topics = self._parse_topics(response.text)

        merged = self._merge_topics(hot_topics, self._load_existing())

        try:
            with open(self.filename, 'w', encoding='utf-8') as f:
                f.write('\n'.join(merged))
            print(f"内容已成功添加到文件 {self.filename} 并排重。")
        except OSError as e:  # narrow: only file-system errors are expected here
            print(f"保存文件时出现错误: {e}")


if __name__ == "__main__":
    url = 'https://top.baidu.com/board?tab=realtime'
    filename = 'baiduresou.txt'
    scraper = BaiduHotTopicsScraper(url, filename)
    scraper.scrape_and_update_file()