通过本案例解析ajax请求返回的信息下载图片
爬取url地址:https://pic.sogou.com/pics?query=动物
分析url地址:每页48条数据(由xml_len参数决定),共计10页
第1页:https://pic.sogou.com/napi/pc/searchList?mode=1&start=0&xml_len=48&query=动物
第2页:https://pic.sogou.com/napi/pc/searchList?mode=1&start=48&xml_len=48&query=动物
第3页:https://pic.sogou.com/napi/pc/searchList?mode=1&start=96&xml_len=48&query=动物
通过分析得出:只需改变请求中的start参数(每页递增48)即可翻页
具体实现代码:
import os
from urllib.parse import urlsplit

import requests
class ImageSougou(object):
    """Download the images Sogou's picture search returns for one keyword.

    The search results are loaded from a JSON endpoint; paging is done by
    advancing the ``start`` query parameter by the page size.  Images are
    written to ``<save_dir>/<word>/``.
    """

    url = 'https://pic.sogou.com/napi/pc/searchList'
    save_dir = './sougou'  # root directory for downloaded files
    count = 0  # download attempts so far; ``self.count += 1`` rebinds it per instance
    page_size = 48  # results requested per page (sent as ``xml_len``)
    timeout = 10  # seconds to wait for each HTTP request before giving up

    def __init__(self, word):
        """Prepare the request parameters and the output directory for *word*."""
        self.word = word
        self.dir_path = os.path.join(self.save_dir, word)
        self.params = {
            'query': word,
            'mode': '1',
            'start': '0',
            'xml_len': self.page_size,
        }
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36',
        }
        # Make sure the directory the images are saved into exists.
        self.folder_exist(self.dir_path)
        # Not read anywhere in this class; kept so existing attribute access still works.
        self.last_img_url = ''
        self.current_page = 0

    def parse(self, max_pages=None):
        """Fetch result pages in order and download every image listed on them.

        Args:
            max_pages: optional upper bound on the number of pages fetched by
                this call; ``None`` (the default) keeps going until a page
                comes back without items.

        Raises:
            requests.RequestException: the listing request failed or timed out.
            ValueError: the listing response was not valid JSON.
        """
        pages_done = 0
        while max_pages is None or pages_done < max_pages:
            self.params['start'] = str(self.current_page * self.page_size)
            response = requests.get(url=self.url, headers=self.headers,
                                    params=self.params, timeout=self.timeout)
            response.encoding = 'utf8'
            payload = response.json()
            # A missing or null ``data``/``items`` is treated as "no more
            # results" instead of crashing with KeyError/TypeError.
            data = payload.get('data') if isinstance(payload, dict) else None
            items = data.get('items') if isinstance(data, dict) else None
            if not items:
                break
            for img_info in items:
                img_url = img_info.get('picUrl')
                if img_url:  # skip entries that carry no image URL
                    self.download(img_url)
            self.current_page += 1
            pages_done += 1

    def download(self, img_url, img_type='jpg'):
        """Download one image and save it in ``self.dir_path``.

        Failures (network error, HTTP error status, unwritable file) are
        reported on stdout and otherwise ignored so one bad image does not
        stop the crawl.

        Args:
            img_url: address of the image.
            img_type: extension appended when the URL's file name has none.

        Returns:
            None.
        """
        self.count += 1
        print('正在下载第%d张图片...' % self.count, img_url)
        try:
            response = requests.get(img_url, timeout=self.timeout)
            # Do not save an HTML error page under an image name.
            response.raise_for_status()
        except requests.RequestException:
            print('下载失败:', img_url)
            return None
        img_name = self._file_name(img_url, img_type, self.count)
        img_path = os.path.join(self.dir_path, img_name)
        try:
            with open(img_path, 'wb') as f:
                f.write(response.content)
        except OSError:
            print('下载失败:', img_url)
        return None

    @staticmethod
    def _file_name(img_url, img_type, fallback):
        """Build a local file name for *img_url*.

        Takes the last segment of the URL path (query string and fragment are
        dropped), uses *fallback* when that segment is empty, and appends
        ``.<img_type>`` when the name has no extension.
        """
        name = urlsplit(img_url).path.rsplit('/', 1)[-1] or str(fallback)
        if not os.path.splitext(name)[1]:
            name = '%s.%s' % (name, img_type)
        return name

    def folder_exist(self, dir_path):
        """Create *dir_path*, including missing parents, if it is not there yet.

        Args:
            dir_path: directory path to ensure.

        Returns:
            None.
        """
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(dir_path, exist_ok=True)
if __name__ == '__main__':
    # Script entry point: crawl all result pages for the keyword and save
    # the images it finds.
    keyword = '动物'
    ImageSougou(keyword).parse()
审核编辑:符乾江
全部0条评论
快来发表一下你的评论吧 !