发布时间:2025-06-24 19:40:50  作者:北方职教升学中心  阅读量:063


简介

Instagram 是目前最热门的社交媒体平台之一,拥有大量优质的视频内容,但是要逐一手动下载这些视频往往非常耗时。在这篇文章中,我们将介绍如何使用 Python 编写一个脚本,来实现 Instagram 视频的批量下载和信息爬取。

结果展示

在这里插入图片描述
在这里插入图片描述
在这里插入图片描述

在这里插入图片描述

源码

想要获取源码的小伙伴加我哦 ,手把手教你部署使用哦
在这里插入图片描述

解析出每一个帖子的下载url downlod_urldownload_pattern =r'href="(https://scontent[^"]+)"'matches =re.findall(download_pattern,part)# # 输出匹配到的结果download_file=[]fori,matchinenumerate(matches,start=1):downlod_url =match.replace("amp;","")download_file.append(downloader(logger,downlod_url,file_dir,file_name))# 文件名desc_pattern =r'<div class="desc">([^"]+)follow'desc_matches =re.findall(desc_pattern,html_source)desc=""formatchindesc_matches:desc=matchlogger.info(f"desc:{match}")returndesc,download_fileparts =total_html_source.split('class="item">')posts_number =len(parts)-2logger.info(f"posts number:{posts_number}")forpost_index,part inenumerate(parts,start=0):id=""post_type =""post_time =""ifpost_index ==0orpost_index ==len(parts)-1:continuelogger.info(f"==================== post {post_index}=====================================")# 解析出每个帖子的时间和 IDtime_pattern =r'class="time">([^"]+)</div>'matches =re.findall(time_pattern,part)formatchinmatches:post_time =matchlogger.info(f"time:{match}")id_pattern =r'<a href="([^"]+)">'id_matches =re.findall(id_pattern,part)formatchinid_matches:id=matchlogger.info(f"id:{id}")# 根据帖子类型进行下载if'#ffffff'inpart:post_type ="Image Set"logger.info("post_type: Image Set")image_name_pex ="img"+str(post_index)desc,post_contents =image_set_downloader(logger,id,image_dir,image_name_pex)elif"video"inpart:post_type ="Video"logger.info("post_type: Video")video_name ="video"+str(post_index)+".mp4"desc,post_contents =image_or_video_downloader(logger,id,video_dir,video_name)else:logger.info("post_type: Image")post_type ="Image"img_name ="img"+str(post_index)+".jpg"desc,post_contents =image_or_video_downloader(logger,id,image_dir,img_name)# 将信息写入 Excel 文件exceller.write_row((post_index,post_time,post_type,desc,', '.join(post_contents)))

最后,我们调用上述定义的函数,实现图片/视频的下载和 Excel 文件的写入。解析出每一个帖子的下载url downlod_urldownload_pattern =r'data-proxy="" data-src="([^"]+)"'matches =re.findall(download_pattern,html_source)download_file=[]# # 输出匹配到的结果fori,matchinenumerate(matches,start=1):downlod_url =match.replace("amp;","")file_name=file_name_prx+"_"+str(i)+".jpg"download_file.append(downloader(logger,downlod_url,file_dir,file_name))desc_pattern =r'<div class="desc">([^"]+)follow'desc_matches =re.findall(desc_pattern,html_source)desc=""formatchindesc_matches:desc=matchlogger.info(f"desc:{match}")returndesc,download_filedefimage_or_video_downloader(logger,id,file_dir,file_name):logger.info("downloading image or video========")image_set_url="https://im"+idhtml_source=get_html_source(image_set_url)# # 打开或创建一个文件用于存储 HTML 源代码# with open(file_dir+file_name+".txt", 'w', encoding='utf-8') as file:# file.write(html_source)# 4、
我们使用selenium获取目标用户的 HTML 源代码,并将其保存在本地:

def get_html_source(html_url):
    """Open *html_url* in Edge, scroll until the page stops growing, return its HTML.

    The page is scrolled to the bottom repeatedly; scrolling stops as soon as
    ``document.body.scrollHeight`` is unchanged after a scroll.

    Relies on the module-level names ``webdriver``, ``time`` and ``logger``,
    which are not defined in this snippet.

    Args:
        html_url: Address of the page to load.

    Returns:
        The value of ``driver.page_source`` after the last scroll.
    """
    option = webdriver.EdgeOptions()
    option.add_experimental_option("detach", True)
    # option.add_argument("--headless")  # uncomment to run Edge headless (no visible window)

    # Instantiate the browser driver with the configured options.
    driver = webdriver.Edge(options=option)
    try:
        # Load the page and give its elements time to appear before acting.
        driver.get(html_url)
        time.sleep(3)

        # =============== Simulate mouse-wheel scrolling ====================
        i = 1
        while True:
            # 1. Scroll to the bottom of the page.
            last_height = driver.execute_script("return document.body.scrollHeight")
            driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
            time.sleep(4)

            # 2. Check whether the bottom has been reached: no growth means no more content.
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            logger.info(f"Scrolled to page{i}")
            i += 1

        html_source = driver.page_source
    finally:
        # Always release the browser session, even if loading or scrolling failed.
        driver.quit()
    return html_source


# Fetch the target user's page and keep a local copy of its HTML.
# NOTE(review): `username` is not defined in this snippet, and the
# ./downloads/<username>/ directory is assumed to exist already — confirm.
total_html_source = get_html_source(f'https://imn/{username}/')
with open(f'./downloads/{username}/html_source.txt', 'w', encoding='utf-8') as file:
    file.write(total_html_source)

然后,我们遍历每个帖子,提取相关信息并下载对应的图片或视频。注意,不同类型的帖子,下载爬取方式不一样:

def downloader(logger, downlod_url, file_dir, file_name):
    """Download *downlod_url* and save it as ``file_dir/file_name``.

    The response is streamed in 1 KiB chunks and a ``tqdm`` progress bar is
    shown while writing. Relies on the module-level names ``requests``,
    ``tqdm`` and ``os``, which are not defined in this snippet.

    Args:
        logger: Object with an ``info`` method used for progress messages.
        downlod_url: Address of the file to fetch.
        file_dir: Directory the file is written into; created if missing.
        file_name: Name of the saved file, also used as the progress-bar label.

    Returns:
        The path of the saved file when the server answers 200, otherwise
        the string ``"err"``.
    """
    logger.info(f"====>downloading:{file_name}")

    # Send the HTTP request; stream so the body is not held in memory at once.
    response = requests.get(downlod_url, stream=True)

    # Check whether the request succeeded.
    if response.status_code != 200:
        logger.info("Failed to download .")
        return "err"

    # Create the output directories. "downloads" is kept from the original;
    # file_dir is the directory the file is actually written to.
    os.makedirs("downloads", exist_ok=True)
    if file_dir:
        os.makedirs(file_dir, exist_ok=True)

    # Total size for the progress bar; 0 when the server sends no content-length.
    total_size = int(response.headers.get('content-length', 0))

    # Save the file. This assignment was swallowed by a comment in the
    # original, which left file_path undefined below.
    file_path = os.path.join(file_dir, file_name)
    progress = tqdm(total=total_size, unit='B', unit_scale=True,
                    unit_divisor=1024, ncols=80, desc=file_name)
    with open(file_path, "wb") as f, progress as pbar:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
                pbar.update(len(chunk))

    logger.info(f"downloaded and saved as {file_path}")
    return file_path


def image_set_downloader(logger, id, file_dir, file_name_prx):
    # NOTE(review): this snippet ends after the page is fetched; the rest of
    # the function body is not part of this snippet.
    logger.info("downloading image set========")
    image_set_url = "https://imm" + id
    html_source = get_html_source(image_set_url)
    # # Open or create a file to store the HTML source
    # with open(file_dir+file_name_prx+".txt", 'w', encoding='utf-8') as file:
    #     file.write(html_source)
    # 4. (the step that follows is not included in this snippet)