Python爬虫(3) 异常处理、正则、下载文件
URLError异常处理
SRE实战 互联网时代守护先锋,助力企业售后服务体系运筹帷幄!一键直达领取阿里云限量特价优惠。
# Example: handling the exceptions urlopen() can raise.
from urllib import request as sa
from urllib import error as er

try:
    sa.urlopen('http://blog.csdn.net')
except er.HTTPError as ee:
    # HTTPError (a URLError subclass, so it must be caught first) always
    # carries an HTTP status code and a reason.
    print(ee.code)
    # BUG FIX: the original printed e.reason here, but 'e' is not bound in
    # this branch (NameError); the caught exception is 'ee'.
    print(ee.reason)
except er.URLError as e:
    # A plain URLError may or may not expose these attributes, so probe
    # before printing.
    if hasattr(e, 'code'):
        print(e.code)
    if hasattr(e, 'reason'):
        print(e.reason)
search() 正则表达式
# Example: re.search() with greedy '.*' and the {m}, {m,} repetition forms.
import re

# Patterns kept as named variables so each result can be inspected below.
p1 = 'py.*n'    # 'py' followed greedily by anything, ending at the last 'n'
p2 = 'cd{2}'    # 'c' followed by exactly two 'd'
p3 = 'cd{3}'    # 'c' followed by exactly three 'd'
p4 = 'cd{2,}'   # 'c' followed by two or more 'd' (greedy)

s = 'abcddddefphp345python_py'

# Run every pattern against the same sample text in one pass.
r1, r2, r3, r4 = (re.search(pattern, s) for pattern in (p1, p2, p3, p4))

for match in (r1, r2, r3, r4):
    print(match)
urlretrieve() 下载文件
# Example: download product thumbnails from JD listing pages with urlretrieve().
import urllib.request as res
import urllib.error as er
import re

# Desktop-browser User-Agent so the server does not reject the crawler.
headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6776.400 QQBrowser/10.3.2601.400')

def sd(u, y):
    """Download every product thumbnail found on listing page *u*.

    Images are saved in the current directory as '<y><index>.jpg',
    where *y* is the page number and index counts images on that page.
    """
    dds = res.build_opener()
    dds.addheaders = [headers]
    # BUG FIX: the original built this opener but never used it, so the
    # custom headers were never sent.  Installing it globally makes both
    # urlopen() and urlretrieve() below go through it.
    res.install_opener(dds)
    # Raw string so '\.' reaches the regex engine unmangled.
    p = r'<img width="220" height="220" data-img="1" src="//(.+?\.jpg)">'
    h = res.urlopen(u).read()
    o = re.compile(p).findall(str(h))
    x = 1
    for i in o:
        im = './' + str(y) + str(x) + '.jpg'
        # The page markup gives protocol-relative URLs ('//...'); prepend a scheme.
        ig = 'http://' + i
        print(i)
        try:
            res.urlretrieve(ig, filename=im)
        except er.URLError as e:
            # Report and skip a failed image.  BUG FIX: the original bumped
            # x up to three times on failure (two hasattr branches plus the
            # loop increment), leaving gaps in the file numbering; x now
            # advances exactly once per image, success or failure.
            if hasattr(e, 'reason'):
                print(e.reason)
        x += 1

for y in range(1, 50):
    u = 'https://list.jd.com/list.html?cat=9987,653,655&page=' + str(y)
    sd(u, y)
链接获取
# Example: collect all hyperlinks appearing in a page's HTML.
import urllib.request as res
import urllib.error as er
import re

def getlink(url):
    """Fetch *url* and return a de-duplicated list of links found in it.

    findall() with two groups yields tuples; element 0 of each tuple is
    the full matched URL.
    """
    # Pretend to be a desktop browser.
    headers = ('User-Agent',
               'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6776.400 QQBrowser/10.3.2601.400')
    op = res.build_opener()
    op.addheaders = [headers]
    # Install the opener globally so the plain urlopen() call sends the headers.
    res.install_opener(op)
    f = res.urlopen(url)
    d = str(f.read())
    print(d)
    # BUG FIX: the original character class was [^s)";], which terminates a
    # match at the first literal letter 's' (truncating most real URLs);
    # \s (whitespace) was clearly intended.  Raw string keeps the
    # backslashes out of Python's own escape processing.
    p = r'(https?://[^\s)";]+\.(\w|/)*)'
    l = re.compile(p).findall(d)
    # Drop duplicate links.
    l = list(set(l))
    return l

url = "https://www.landi.com/"
l = getlink(url)
print(l)
for ll in l:
    print(ll[0])
