Python爬虫(3) 异常处理、正则、下载文件
URLError异常处理
SRE实战 互联网时代守护先锋,助力企业售后服务体系运筹帷幄!一键直达领取阿里云限量特价优惠。
# Example: handling the exceptions urlopen() can raise.
from urllib import request as sa
from urllib import error as er

try:
    sa.urlopen('http://blog.csdn.net')
except er.HTTPError as ee:
    # HTTPError (a URLError subclass, so it must be caught first) always
    # carries an HTTP status code and a reason.
    print(ee.code)
    # BUG FIX: the original printed e.reason here, but 'e' is not bound in
    # this branch (NameError); the caught exception is 'ee'.
    print(ee.reason)
except er.URLError as e:
    # A plain URLError may or may not expose these attributes, so probe
    # before printing.
    if hasattr(e, 'code'):
        print(e.code)
    if hasattr(e, 'reason'):
        print(e.reason)
search() 正则表达式
# Example: re.search() with greedy '.*' and the {m}, {m,} repetition forms.
import re

# Patterns kept as named variables so each result can be inspected below.
p1 = 'py.*n'    # 'py' followed greedily by anything, ending at the last 'n'
p2 = 'cd{2}'    # 'c' followed by exactly two 'd'
p3 = 'cd{3}'    # 'c' followed by exactly three 'd'
p4 = 'cd{2,}'   # 'c' followed by two or more 'd' (greedy)

s = 'abcddddefphp345python_py'

# Run every pattern against the same sample text in one pass.
r1, r2, r3, r4 = (re.search(pattern, s) for pattern in (p1, p2, p3, p4))

for match in (r1, r2, r3, r4):
    print(match)
urlretrieve() 下载文件
# Example: download product thumbnails from JD listing pages with urlretrieve().
import urllib.request as res
import urllib.error as er
import re

# Desktop-browser User-Agent so the server does not reject the crawler.
headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6776.400 QQBrowser/10.3.2601.400')

def sd(u, y):
    """Download every product thumbnail found on listing page *u*.

    Images are saved in the current directory as '<y><index>.jpg',
    where *y* is the page number and index counts images on that page.
    """
    dds = res.build_opener()
    dds.addheaders = [headers]
    # BUG FIX: the original built this opener but never used it, so the
    # custom headers were never sent.  Installing it globally makes both
    # urlopen() and urlretrieve() below go through it.
    res.install_opener(dds)
    # Raw string so '\.' reaches the regex engine unmangled.
    p = r'<img width="220" height="220" data-img="1" src="//(.+?\.jpg)">'
    h = res.urlopen(u).read()
    o = re.compile(p).findall(str(h))
    x = 1
    for i in o:
        im = './' + str(y) + str(x) + '.jpg'
        # The page markup gives protocol-relative URLs ('//...'); prepend a scheme.
        ig = 'http://' + i
        print(i)
        try:
            res.urlretrieve(ig, filename=im)
        except er.URLError as e:
            # Report and skip a failed image.  BUG FIX: the original bumped
            # x up to three times on failure (two hasattr branches plus the
            # loop increment), leaving gaps in the file numbering; x now
            # advances exactly once per image, success or failure.
            if hasattr(e, 'reason'):
                print(e.reason)
        x += 1

for y in range(1, 50):
    u = 'https://list.jd.com/list.html?cat=9987,653,655&page=' + str(y)
    sd(u, y)
链接获取
# Example: collect all hyperlinks appearing in a page's HTML.
import urllib.request as res
import urllib.error as er
import re

def getlink(url):
    """Fetch *url* and return a de-duplicated list of links found in it.

    findall() with two groups yields tuples; element 0 of each tuple is
    the full matched URL.
    """
    # Pretend to be a desktop browser.
    headers = ('User-Agent',
               'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6776.400 QQBrowser/10.3.2601.400')
    op = res.build_opener()
    op.addheaders = [headers]
    # Install the opener globally so the plain urlopen() call sends the headers.
    res.install_opener(op)
    f = res.urlopen(url)
    d = str(f.read())
    print(d)
    # BUG FIX: the original character class was [^s)";], which terminates a
    # match at the first literal letter 's' (truncating most real URLs);
    # \s (whitespace) was clearly intended.  Raw string keeps the
    # backslashes out of Python's own escape processing.
    p = r'(https?://[^\s)";]+\.(\w|/)*)'
    l = re.compile(p).findall(d)
    # Drop duplicate links.
    l = list(set(l))
    return l

url = "https://www.landi.com/"
l = getlink(url)
print(l)
for ll in l:
    print(ll[0])
