【Python】爬取公众号历史文章
时间:2025-06-24 12:26:04 来源:新华社
【字体:  

1、获取公众号首页地址

模版链接如下:

https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=xxxxxx

找一篇公众号文章,通过浏览器打开,按F12获取biz=MzkyNDY2OTgzOQ%3D%3D

获取公众号biz

替换模版链接中的biz使用微信浏览器打开

https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MzkyNDY2OTgzOQ%3D%3D

2、安装配置抓包工具

下载地址:

https://reqable.com/zh-CN/

代理配置:打开工具后一般会自动配置这个,如果没有的话自己配置一下

代理配置

3、获取目标链接

1、打开抓包工具

打开抓包工具之后,点击启动。

抓包工具启动页面

2、打开目标公众号

关注目标公众号,并在微信中打开目标公众号的首页地址

打开目标公众号首页

目标公众号首页

3、获取历史文章接口

下拉浏览公众号历史文章,在抓包工具中获取公众号历史文章接口

offset=0&count=10 这两个参数是抓取数据的关键

目标接口地址

4、代码编写

1、header格式化

使用在线格式化工具

https://kgtools.cn/compression/header

使用正则表达式替换

(.*?):(.*)   替换为 '$1':'$2',

由于抓包工具的cookie显示成多个,header处理的时候先不管cookie

2、cookie获取

两种cookie内容不一样,任意选一种就可以了

抓包工具的最后一行cookie值也可以使用

抓包工具获取cookie

使用浏览器打开目标URL,按F12获取cookie

浏览器获取cookie

3、编写代码

由于cookie有时效性,当失效的时候需要重新抓包获取cookie

获取到原始数据之后就可以根据自己的业务逻辑进行处理了

"""Fetch one page of a WeChat Official Account's article history and print it.

Demo script: hits the `action=getmsg` endpoint once with credentials captured
via a packet sniffer.  The `key`, `pass_ticket` and cookie values are
short-lived session tokens — when they expire the endpoint answers
``{"base_resp": {"ret": -3, "errmsg": "no session", ...}}`` and they must be
re-captured.

Sample captured request (for reference):
https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz=...&f=json&offset=22&count=10&...
"""
import requests  # third-party; the only dependency

# Target link captured from the sniffer.  `offset` and `count` are the two
# paging parameters; everything else is session state.
url = 'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz=MzUxMDkwMjI3NQ==&f=json&offset=0&count=10&is_ok=1&scene=&uin=NDExOTI3MTY5Ng%3D%3D&key=df31274cb0184367880a3118e441be5b7574210fd2c1435b44fed9fe7aa75c6c61239e69d932af145f34f329bbdef7fd7c3c6af2e5a3b66a69ba916b93ab66a8a8f0c56840039476247e9d3b11c0c5ee0ee57f92be28d62b4b93645ea312620121ad65bf08c8d734860ab48aae6d1441c1c9af81b768beeb78e7551bfd299970&pass_ticket=eDPkihNNCPHyNvSr%2Bl3JjZgp9Xcar%2FOQmf20esaVQ0Lfrm%2Fzei1B%2FXTySk9z%2Fl3u4VctafhjncQk%2BUOCVAVcUQ%3D%3D&wxtoken=&appmsg_token=1264_45nRYND3xEvbRE5Lu4z_bxwkoUh4J19apdr1tg~~&x5=0&f=json'

# Session cookie captured from the sniffer (expires quickly — re-capture on
# "no session" responses).
cookie = 'wap_sid2=CJCynKwPEooBeV9IUFdxZDFPdmRoTm9mSEk1R3dha1k0LUtab3VSdnNCVWVTT2hJNWlPQ1JiY2otZEs5MmxiQ1I5RUcxdHJTdlQteFMtTExhdUItVkRnQTgtaUcybmU2d1FvTDJ6d2ktdlpvVW50dGc3em91X0dwckdPb1d6U005Z2xOTTRrVWFmUmVjNFNBQUF+MMDz07AGOA1AlU4='

# Request headers replayed from the capture.  The user-agent impersonates the
# built-in WeChat Windows browser (the endpoint rejects ordinary browsers).
headers = {
    'host': 'mp.weixin.qq.com',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x6309092b) XWEB/9079 Flue',
    'x-requested-with': 'XMLHttpRequest',
    'accept': '*/*',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'cors',
    'sec-fetch-dest': 'empty',
    'referer': 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MzUxMDkwMjI3NQ%3D%3D&uin=NDExOTI3MTY5Ng%3D%3D&key=df31274cb01843671e8924a1ddb0a0a019eb5018d8976a8aefea0104e84a7ae9df90e2f5abebb9424c6420632271e97a016875dcabb2fc9d509edcd59cde65ebb5d23422f65fb76a9911b898014cb76988c544c5242d64293df6438604eef0c05de8c2b0c4f8a674ebb545444525579e9bc0f30e4cd0b85361851acc9a6e36d2&devicetype=Windows+10+x64&version=6309092b&lang=zh_CN&a8scene=0&acctmode=0&pass_ticket=eDPkihNNCPHyNvSr%2Bl3JjZgp9Xcar%2FOQmf20esaVQ0IQTviLsLUlgUAGpS06V0703%2FF9NCSRL2zsHv0HTKxkDQ%3D%3D&wx_header=1',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cookie': cookie,
}

if __name__ == '__main__':
    # Guarded so importing this module performs no network I/O.
    res = requests.get(url=url, headers=headers)
    # Dump the raw JSON body; downstream processing is up to the caller.
    print(res.text)

4、session过期返回

{ "base_resp":{ "ret":-3,"errmsg":"no session","cookie_count":0},"ret":-3,"errmsg":"no session","cookie_count":0}

5、没有数据返回

{ 'ret': 0, 'errmsg': 'ok', 'msg_count': 0, 'can_msg_continue': 0, 'general_msg_list': '{ "list":[]}', 'next_offset': 5000, 'video_count': 1, 'use_video_tab': 1, 'real_type': 0, 'home_page_list': []}

6、被封返回

{ 'ret': -6, 'errmsg': 'unknown error', 'home_page_list': []}

5、代码展示

"""Crawl the complete article history of a WeChat Official Account.

Workflow: take the captured ``action=home`` profile URL (whose query string
carries ``__biz`` / ``uin`` / ``key`` / ``pass_ticket``), page through the
``action=getmsg`` endpoint 10 articles at a time, and append each article's
title / URL / publish time to ``article_list.csv``.

The ``key``/``pass_ticket`` credentials are short-lived: when the API answers
``ret == -3`` ("no session") they must be re-captured with the sniffer.
``ret == -6`` means the crawler was rate-limited/blocked.
"""
import csv
import datetime
import json
import os
import random
import time
import urllib.parse


def extract_url_params(url):
    """Return the query-string parameters of *url* as a flat dict.

    ``parse_qs`` maps each key to a list; only the first value is kept,
    matching how the WeChat URLs use each parameter exactly once.
    """
    parsed_url = urllib.parse.urlparse(url)
    if not parsed_url.query:
        return {}
    return {key: values[0]
            for key, values in urllib.parse.parse_qs(parsed_url.query).items()}


def is_value_empty(value):
    """True for the "missing field" shapes the API returns: None, '', [] or {}."""
    return value is None or value == '' or value == [] or value == {}


def write_article_list_to_csv(article_list):
    """Append *article_list* rows to ``article_list.csv``.

    Uses the csv module so titles containing commas/quotes are escaped
    correctly, and writes the header row only when the file does not exist
    yet (the original re-wrote the header on every append).
    """
    write_header = not os.path.exists('article_list.csv')
    with open('article_list.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow(['文章标题', '文章地址', '发布时间'])
        for article in article_list:
            writer.writerow([
                article['title'],
                article['content_url'],
                datetime.datetime.fromtimestamp(article['datetime']),
            ])


def get_article_list(refer):
    """Page through the getmsg endpoint and dump every article to CSV.

    *refer* is the captured ``action=home`` profile URL; its query string
    supplies the __biz / uin / key / pass_ticket session parameters.
    Stops on rate-limit (-6), session expiry (-3), end of data, or any
    unexpected response shape.
    """
    # Third-party dependency imported locally so the pure helpers above stay
    # importable without `requests` installed.
    import requests

    article_list = []
    url = 'https://mp.weixin.qq.com/mp/profile_ext'
    headers = {
        'host': 'mp.weixin.qq.com',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x6309092b) XWEB/9079 Flue',
    }
    # Pull the session parameters out of the captured home-page link.
    refer_params = extract_url_params(refer)
    params = {
        'action': 'getmsg',
        '__biz': refer_params['__biz'],
        'offset': 0,
        'count': 10,
        'uin': refer_params['uin'],
        'key': refer_params['key'],
        'pass_ticket': refer_params['pass_ticket'],
        'f': 'json',
    }
    while True:
        # Randomized 20-30 s delay between pages to avoid the rate limiter.
        time.sleep(random.randint(20, 30))
        res = requests.get(url=url, headers=headers, params=params)
        json_data = res.json()
        ret = json_data['ret']
        if ret == -6:
            print(json_data)
            print('爬虫速度太快被封了...')
            break
        elif ret == -3:
            print(json_data)
            print('会话过期了...')
            break
        elif ret == 0:
            if 'can_msg_continue' not in json_data:
                # Unexpected shape: bail out instead of spinning forever with
                # an unchanged offset (the original looped endlessly here).
                print(json_data)
                break
            # general_msg_list is a JSON string embedded in the JSON response.
            general_msg_list = json.loads(json_data['general_msg_list'])
            for msg in general_msg_list['list']:
                if 'app_msg_ext_info' not in msg or 'comm_msg_info' not in msg:
                    continue
                title = msg['app_msg_ext_info']['title']
                content_url = msg['app_msg_ext_info']['content_url']
                # Renamed from `datetime` — the original shadowed the module.
                publish_time = msg['comm_msg_info']['datetime']
                # Skip entries with any missing field.
                if (is_value_empty(title) or is_value_empty(content_url)
                        or is_value_empty(publish_time)):
                    continue
                article_list.append({'title': title,
                                     'content_url': content_url,
                                     'datetime': publish_time})
                print(title, content_url)
            if json_data['can_msg_continue'] == 0:
                print('数据读取完毕!')
                print(json_data)
                break
            # Advance to the next page.
            params['offset'] = json_data['next_offset']
        else:
            # Unknown status code: stop rather than retry forever.
            print(json_data)
            break
    write_article_list_to_csv(article_list)


if __name__ == '__main__':
    refer = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MzkzOTIwMjkyMg==&uin=MjI4OTEzMzMwMQ%3D%3D&key=0e25620c5de632befc36897216780fd24f1198cfede94a6e8a9f71139b940c2a2338bb0ea32efc7069b32659ab29f038474583a26a9446e010603f6662787f2b6fcd81a432d8530af7f2acdba318632b557b97cd65e9bfa716f542d87f30578f6bff657b611e0a7587f3864942fd46ab4ee37ed36a623418028c89e67fe327ee&devicetype=Windows+10+x64&version=6309092b&lang=zh_CN&a8scene=0&acctmode=0&pass_ticket=TzBKMHSun3KkQLaEApYHnWfMot5QQXiQSTLU0VYg4xN3LWA0wbcJ0O6HjSR9W4tnT3z5JIM9a73asCxiskMLbQ%3D%3D&wx_header=1'
    start_time = time.time()
    get_article_list(refer)
    end_time = time.time()
    print("爬取数据耗时:", end_time - start_time)

[责任编辑:百度一下]
检察日报数字报 | 正义网 |
Copyrights©最高人民检察院 All Rights Reserved.