Scraping 知识星球 (Zhishixingqiu) content for a specified date

Reading the 风向标 posts in the 生财 Zhishixingqiu every day is a headache because the volume of data is just too large. So I have been wanting to write a small tool that grabs the content for a specified date, so I can skim the plain text first and only open a post's details when something looks useful.

So I quickly put together a small Python scraper; the code is below.

[Python]

import time
import datetime

import requests
from bs4 import BeautifulSoup


def make_api_request(start_time=None):
    url = 'https://api.zsxq.com/v2/hashtags/28514444451481/topics'
    cookie = "paste your own Cookie here"
    headers = {
        'authority': 'api.zsxq.com',
        'accept': 'application/json, text/plain, */*',
        'cookie': cookie,
        'origin': 'https://wx.zsxq.com',
        'referer': 'https://wx.zsxq.com/',
        'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Microsoft Edge";v="114"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': 'Windows',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        'x-request-id': '378351101-680d-dd77-79a7-3a8c5e4a305',
        'x-version': '2.40.0',
        # 'x-signature': 'fill in the real x-signature value here if required',
        # 'x-timestamp': 'fill in the real x-timestamp value here if required',
    }
    # Page backwards in time: when start_time is given, ask the API only for
    # topics created before that timestamp.
    params = {'count': 20}
    if start_time:
        params['end_time'] = start_time

    response = requests.get(url, params=params, headers=headers)
    if response.status_code == 200:
        return response.json()
    print("Request failed:", response.status_code)
    return None


def is_valid_date(strdate):
    """Return True if the string matches the '%Y-%m-%d %H:%M:%S' format."""
    try:
        time.strptime(strdate, "%Y-%m-%d %H:%M:%S")
        return True
    except (ValueError, TypeError):
        return False


def remove_html_tags(text):
    """Strip HTML tags from a string."""
    return BeautifulSoup(text, "html.parser").get_text()


# Collected topic texts from all pages.
all_response_data = []

if __name__ == "__main__":
    target_time = input("Enter the cutoff time, e.g. 2023-07-27 00:00:00 : ")
    if target_time == '' or not is_valid_date(target_time):
        # Default to 00:00:00 of today if the input is empty or malformed.
        target_time = datetime.datetime.now().strftime('%Y-%m-%d 00:00:00')

    # Keep requesting older pages until create_time drops below the cutoff.
    start_time = None
    while True:
        response_data = make_api_request(start_time)
        if not response_data or not response_data.get('succeeded'):
            # Failed or rate-limited request: wait and try again.
            time.sleep(10)
            continue

        topics = response_data['resp_data']['topics']
        if not topics:
            # No more topics returned, stop paging.
            break

        # The last topic on the page is the oldest one.
        topic = topics[-1]
        if 'create_time' not in topic:
            break

        for item in topics:
            # Topics without a 'talk' field (e.g. Q&A posts) are skipped.
            text = item.get('talk', {}).get('text', '')
            if text:
                all_response_data.append("- " + text)

        create_time = topic['create_time']
        print(create_time)
        if create_time < target_time:
            break
        start_time = create_time

        # Wait 10 seconds between requests to avoid hammering the API.
        time.sleep(10)

    print("Request loop finished!")

    # Strip HTML, double quotes and line breaks from every collected topic.
    clean_data_list = [remove_html_tags(text).replace('"', '').replace('\n', ' ')
                       for text in all_response_data]

    # Save everything to a UTF-8 Markdown file named after the current time.
    file_name = datetime.datetime.now().strftime('%H-%M-%S') + ".md"
    with open(file_name, 'w', encoding='utf-8') as file:
        for text in clean_data_list:
            file.write(text + "\n")

    print("All scraped topics have been saved to:", file_name)
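
One thing worth noting: the script compares create_time and target_time as raw strings, which happens to work because the date prefix sorts lexicographically, but it is fragile. If you want a more robust cutoff check, a small sketch like the following could replace the string comparison. It assumes the API's create_time looks like 2023-07-27T10:31:41.123+0800; adjust the format strings if your responses differ.

[Python]
from datetime import datetime

def is_before(create_time: str, target_time: str) -> bool:
    """Compare a topic's create_time with the user-entered cutoff as datetimes.

    Assumes create_time looks like '2023-07-27T10:31:41.123+0800' and
    target_time is '%Y-%m-%d %H:%M:%S' in local time; tweak the formats if
    your API responses differ.
    """
    created = datetime.strptime(create_time, "%Y-%m-%dT%H:%M:%S.%f%z")
    target = datetime.strptime(target_time, "%Y-%m-%d %H:%M:%S").astimezone()
    return created < target

Inside the loop you would then write if is_before(create_time, target_time): break instead of comparing the strings directly.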




After installing the required dependencies (requests and beautifulsoup4), replace the cookie with your own and you can scrape by date. The Cookie can be found here:

[Screenshot: image.png (46.66 KB), uploaded 2023-7-27 15:39, showing where to find the Cookie]
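
If you plan to share the script, a safer option than hardcoding the cookie is to read it from an environment variable. A minimal sketch (ZSXQ_COOKIE is just a name picked for this example):

[Python]
import os

# Read the zsxq cookie from an environment variable instead of hardcoding it,
# so the script can be shared without leaking your session.
cookie = os.environ.get("ZSXQ_COOKIE", "")
if not cookie:
    raise SystemExit("Set the ZSXQ_COOKIE environment variable to your zsxq cookie first")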



The exported result looks like this: