通过分析ajax中信息爬取图片

python爬虫知识分享 2022-03-23 1801

描述

通过本案例解析ajax请求返回的信息下载图片

爬取url地址：https://pic.sogou.com/pics?query=动物

分析：

分析url地址：每页25条数据，共计10页

第1页：https://pic.sogou.com/napi/pc/searchList?mode=1&start=0&xml_len=48&query=动物
第2页：https://pic.sogou.com/napi/pc/searchList?mode=1&start=48&xml_len=48&query=动物
第3页：https://pic.sogou.com/napi/pc/searchList?mode=1&start=96&xml_len=48&query=动物

通过分析得出请求改变start参数就可以改变页数

具体实现代码：

import requests
import os

class ImageSougou(object):
   url = 'https://pic.sogou.com/napi/pc/searchList'
   save_dir = './sougou' # 文件保存的路径
   count = 0
   # 初始化
   def __init__(self, word):
       self.word = word
       self.dir_path = os.path.join(self.save_dir, word)
       self.params = {
           'query': word,
           'mode': '1',
           'start': '0',
           'xml_len': 48,
       }
       self.headers = {
           'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36',
       }
       # 创建保存图片的文件夹
       self.folder_exist(self.dir_path)
       self.last_img_url = ''
       self.current_page = 0
   
   # 请求包含图片url的网页
   def parse(self):
       while True:
           self.params['start'] = str(self.current_page*48)
           response = requests.get(url=self.url, headers=self.headers, params=self.params)
           response.encoding='utf8'
           data = response.json()['data']['items']
           if data:
               for img_info in data:
                   img_url = img_info['picUrl']
                   self.download(img_url)
               self.current_page += 1
           else:
               break
   # 下载一张图片
   def download(self, img_url, img_type='jpg'):
       self.count += 1
       print('正在下载第%d张图片...'%self.count, img_url)
       try:
           response = requests.get(img_url)
       except Exception as e:
           print('下载失败:', img_url)
           return None
       img_name = img_url.split('/')[-1]
       img_path = os.path.join(self.dir_path, img_name)
       try:
           with open(img_path, 'wb') as f:
               f.write(response.content)
       except Exception as e:
           print('下载失败:', img_url)
   def folder_exist(self, dir_path):
       '''
       1. 作用:判断文件夹路径是否存在,不存在则创建
       2. 参数:dir_path:文件夹路径
       3. 返回值:None
       '''
       if not os.path.exists(dir_path):
           os.makedirs(dir_path)
if __name__ == '__main__':
   image = ImageSougou('动物')
   image.parse()

审核编辑：符乾江

打开APP阅读更多精彩内容