requests小技巧
数据分类
数据处理之JSON
import json
import requests
class DouBanMovie:
    """Fetch a Douban TV collection as JSON and round-trip it through a file.

    The class demonstrates the three basic ``json`` operations:
    ``json.loads`` (string -> Python object), ``json.dumps`` (Python object
    -> string) and ``json.load`` (file -> Python object).
    """

    def __init__(self):
        """Store the request URL and the headers sent with it."""
        self.url = "https://m.douban.com/rexxar/api/v2/subject_collection/tv_domestic/items?os=android&for_mobile=1&start=0&count=18&loc_id=108288"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Mobile Safari/537.36",
            "Referer": "https://m.douban.com/tv/chinese",
        }

    def get_data(self, timeout=10):
        """Request ``self.url`` and return the decoded JSON payload.

        Args:
            timeout: Seconds to wait for the server before giving up.
                Without a timeout ``requests`` may block indefinitely.

        Returns:
            The parsed JSON document, or ``None`` when the response status
            is not 200.
        """
        response = requests.get(self.url, headers=self.headers, timeout=timeout)
        if response.status_code != 200:
            return None
        # Convert the JSON string into the matching Python data types.
        return json.loads(response.content.decode())

    def write_file(self, fileName, data):
        """Serialize ``data`` as JSON and write it to ``fileName`` (UTF-8)."""
        # json.dumps turns Python data into a JSON string; ensure_ascii=False
        # keeps non-ASCII text (e.g. Chinese) readable instead of \uXXXX.
        text = json.dumps(data, ensure_ascii=False)
        with open(fileName, "w", encoding="utf-8") as f:
            f.write(text)

    def read_file(self, fileName):
        """Read ``fileName`` (UTF-8) and return the parsed JSON content."""
        with open(fileName, "r", encoding="utf-8") as f:
            # json.load parses JSON directly from a file object.
            return json.load(f)

    def run(self):
        """Fetch the data, save it to ``douban.txt``, read it back and print it."""
        # Fetch the data.
        result = self.get_data()
        if result is None:
            # Do not overwrite the file with "null" when the request failed.
            print("Request failed; nothing was written.")
            return
        # Write the Douban data to a file.
        self.write_file("douban.txt", result)
        # Read the file content back.
        readResult = self.read_file("douban.txt")
        print(readResult)
# Script entry point: run the Douban JSON demo only when executed directly.
if __name__ == '__main__':
    douban = DouBanMovie()
    douban.run()
数据处理之正则表达式
import requests
import re
class ChengYu:
    """Download an idiom-story page and extract the story with a regex."""

    def __init__(self):
        """Store the page URL and the headers sent with the request."""
        self.url = "http://www.hydcd.com/cy/gushi/0259hs.htm"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Mobile Safari/537.36"
        }

    def get_data(self, timeout=10):
        """Fetch the page and return its HTML as text.

        Args:
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The page body decoded as GB2312, or ``None`` when the response
            status is not 200.
        """
        # headers must be passed by keyword: the second positional argument
        # of requests.get is ``params`` (the query string), not the headers.
        response = requests.get(self.url, headers=self.headers, timeout=timeout)
        if response.status_code != 200:
            return None
        return response.content.decode("gb2312")

    def handle_data(self, html_str_list):
        """Return the first match with CR, tab and LF characters removed.

        Args:
            html_str_list: Matches produced by ``re.findall``; may be
                ``None`` or empty.

        Returns:
            The cleaned first match, or ``None`` when there is no match.
        """
        if not html_str_list:
            return None
        return re.sub(r"[\r\t\n]", "", html_str_list[0])

    def run(self):
        """Fetch the page, extract the story text, clean it and print it."""
        # Fetch the web page.
        html_str = self.get_data()
        if html_str is None:
            print("Request failed; nothing to parse.")
            return
        # Extract the idiom story with a regular expression.
        # NOTE(review): the HTML tags of the original pattern were lost; this
        # pattern is reconstructed on the assumption that the story sits in
        # <font color="#10102C"> elements - confirm against the page source.
        html_str_list = re.findall(
            r'<font[^>]*color="#10102C"[^>]*>(.*?)</font>', html_str, re.S
        )
        # Strip line breaks, tabs and similar control characters.
        result = self.handle_data(html_str_list)
        print(result)
# Script entry point: run the regex demo only when executed directly.
if __name__ == '__main__':
    chengYu = ChengYu()
    chengYu.run()
数据处理之xpath
| 表达式 | 描述 |
|---|---|
| nodename | 选取此节点的所有子节点 |
| / | 从根节点选取 |
| // | 从匹配选择的当前节点选择文档中的节点,而不考虑它们的位置 |
| . | 选取当前节点 |
| .. | 选取当前节点的父节点 |
| @ | 选取属性 |
import requests
import re
from lxml import etree
class ChengYu:
    """Download an idiom-story page and extract the story with XPath."""

    def __init__(self):
        """Store the page URL and the headers sent with the request."""
        self.url = "http://www.hydcd.com/cy/gushi/0259hs.htm"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Mobile Safari/537.36"
        }

    def get_data(self, timeout=10):
        """Fetch the page and return its HTML as text.

        Args:
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The page body decoded as GB2312, or ``None`` when the response
            status is not 200.
        """
        # headers must be passed by keyword: the second positional argument
        # of requests.get is ``params`` (the query string), not the headers.
        response = requests.get(self.url, headers=self.headers, timeout=timeout)
        if response.status_code != 200:
            return None
        return response.content.decode("gb2312")

    def handle_data(self, html_str_list):
        """Join the first two text nodes and remove CR, tab and LF characters.

        Args:
            html_str_list: Text nodes returned by the XPath query; may be
                ``None`` or empty.

        Returns:
            The cleaned text, or ``None`` when there are no nodes.
        """
        if not html_str_list:
            return None
        # Slicing tolerates a single-element list, where indexing [1] would
        # raise IndexError.
        html_str = "".join(html_str_list[:2])
        return re.sub(r"[\r\t\n]", "", html_str)

    def run(self):
        """Fetch the page, select the story text via XPath, clean and print it."""
        # Fetch the web page.
        html_str = self.get_data()
        if html_str is None:
            print("Request failed; nothing to parse.")
            return
        # Extract the elements with XPath.
        html = etree.HTML(html_str)
        result = html.xpath('//font[@color="#10102C"]/text()')
        # Strip line breaks, tabs and similar control characters.
        result = self.handle_data(result)
        print(result)
# Script entry point: run the XPath demo only when executed directly.
if __name__ == '__main__':
    chengYu = ChengYu()
    chengYu.run()
全部0条评论
快来发表一下你的评论吧 !