# 需求:用 Python 采集百度热搜词,自动去重后保存为 txt 文件。在服务器上为该脚本配置定时任务,即可持续采集百度热搜词。
"""Collect Baidu real-time hot-search titles into a de-duplicated text file.

Each run downloads the hot-search board page, extracts the topic titles,
merges them with the titles already stored in ``baiduresou.txt`` and rewrites
that file.  Run it from a scheduler to keep accumulating topics.
"""

BOARD_URL = 'https://top.baidu.com/board?tab=realtime'
OUTPUT_FILENAME = 'baiduresou.txt'
REQUEST_TIMEOUT_SECONDS = 10

# NOTE(review): the original script searched for empty-string markers
# (``content.find('')``), which always matches at index 0 and therefore loops
# forever.  The real markers were presumably HTML tags lost when the snippet
# was pasted.  The values below are an assumption about the board's markup --
# TODO confirm against the live page source.
TITLE_START_MARKER = '<div class="c-single-text-ellipsis">'
TITLE_END_MARKER = '</div>'


def fetch_board_html(url=BOARD_URL, timeout=REQUEST_TIMEOUT_SECONDS):
    """Download *url* and return the response body as text.

    Raises whatever ``requests`` raises on connection failure, timeout, or a
    non-2xx status (via ``raise_for_status``).
    """
    # Imported lazily so the parsing/merging helpers below remain importable
    # (and testable) on machines without the third-party dependency.
    import requests

    response = requests.get(url, timeout=timeout)
    # Do not parse an error page as if it were the hot-search board.
    response.raise_for_status()
    return response.text


def extract_hot_topics(content, start_marker=TITLE_START_MARKER,
                       end_marker=TITLE_END_MARKER):
    """Return the stripped text found between each marker pair in *content*.

    Titles are returned in page order; blank titles are skipped.  Scanning
    stops at a start marker that has no matching end marker.

    Raises:
        ValueError: if either marker is empty (an empty marker matches
            everywhere and would never let the scan advance).
    """
    if not start_marker or not end_marker:
        raise ValueError('start_marker and end_marker must be non-empty')

    topics = []
    start_index = content.find(start_marker)
    while start_index != -1:
        text_start = start_index + len(start_marker)
        end_index = content.find(end_marker, text_start)
        if end_index == -1:
            # Truncated markup: there is no complete title left to read.
            break
        topic = content[text_start:end_index].strip()
        if topic:
            topics.append(topic)
        start_index = content.find(start_marker, end_index + len(end_marker))
    return topics


def merge_topics(existing_topics, new_topics):
    """Return existing topics followed by unseen new ones, without duplicates.

    First-seen order is preserved so the stored file stays stable between
    runs; empty strings are dropped.
    """
    seen = set()
    merged = []
    for topic in [*existing_topics, *new_topics]:
        if topic and topic not in seen:
            seen.add(topic)
            merged.append(topic)
    return merged


def read_existing_topics(filename):
    """Return the stripped lines of *filename*, or ``[]`` if it does not exist."""
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            return [line.strip() for line in f]
    except FileNotFoundError:
        # First run: nothing has been collected yet.
        return []


def update_topics_file(filename, new_topics):
    """Merge *new_topics* into *filename* and return the merged topic list.

    The file is rewritten with one topic per line, joined by newlines.
    """
    merged = merge_topics(read_existing_topics(filename), new_topics)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write('\n'.join(merged))
    return merged


def main():
    """Fetch the board, extract its titles and update the output file."""
    content = fetch_board_html()
    update_topics_file(OUTPUT_FILENAME, extract_hot_topics(content))


if __name__ == '__main__':
    main()