Python爬取宅男女神网美女照片
先查看一下宅男女神网的爬虫协议(即网站根目录下的 robots.txt)。
发现这个网站可以爬取。
我只爬取了国家分类那一栏,其他的同理
先看看这个网页的源代码,美国分类的网址为:网站根地址加上 /gallery/meiguo/ 。
每个国家下面不会超过20页,我设置爬取的页面从第一页到第二十页。
不存在的页面会没有专栏,可以根据正则表达式匹配到的链接数为0直接退出循环。
先把专栏的链接保存到列表1.
再去每个专栏,查看图片的链接,再保存到列表2。
再把列表2每个图片链接爬取出来。
难点:链接的分析,正则表达式的匹配,和分析。
代码下载链接:
需要在D盘建立 mv 这个文件夹,不然会崩溃。没有用多线程,效率低,大约能爬取19万张图片,我爬取了八万多张,用了12G内存。
代码如下:
import requests
import re
import _thread
import os
def getHTMLText(url):
    """Download *url* and return the response body as text.

    The request is sent with a desktop-Chrome ``user-agent`` header and a
    30 second timeout.  The response encoding is set from
    ``apparent_encoding`` before the text is decoded.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The decoded page text, or an empty string when the request fails
        (connection error, timeout, or a non-2xx status).
    """
    kv = {
        "user-agent": "mozilla/5.0 (Windows nt 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
    }
    try:
        r = requests.get(url, headers=kv, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best effort: report and let the caller treat the page as empty.
        print("遇到错误")
        return ""
def parsePage(ilt, html):
    """Append the image URLs of one album page to *ilt*.

    The page is expected to contain a ``<span style='color: #DB0909'>N张照片
    </span>`` counter and at least two ``<img src='....jpg' alt=`` tags.
    The URL of the second matching image, minus its last two characters,
    is used as the common prefix; the generated names are ``<prefix>.jpg``
    followed by ``<prefix>01.jpg`` ... ``<prefix>(N-1).jpg`` (numbers below
    10 are zero-padded to two digits), i.e. N URLs in total.

    Args:
        ilt: List that receives the generated image URLs (mutated in place).
        html: Album page source; ``None`` or ``""`` is treated as a failure.

    Returns:
        None.  On any parsing failure ``遇到错误2`` is printed and *ilt* is
        left unchanged.
    """
    if not html:
        print("遇到错误2")
        return

    # Either quote style is accepted around attribute values.
    count_match = re.search(
        r"<span style=['\"]color: #DB0909['\"]>([0-9]+)张照片</span>", html
    )
    images = re.findall(r"<img src=['\"](.{50,60})\.jpg['\"] alt=", html)
    if count_match is None or len(images) < 2:
        print("遇到错误2")
        return

    total = int(count_match.group(1))
    # Drop the trailing two characters of the second image's name so that
    # the prefix can be completed with the per-image suffixes below.
    prefix = images[1][:-2]

    ilt.append(prefix + ".jpg")
    for i in range(1, total):
        ilt.append(prefix + str(i).zfill(2) + ".jpg")
def parsePage1(ilt, html, base_url="https://www.nvshens.com"):
    """Append the absolute URL of every album linked from a listing page.

    Looks for ``<a class='galleryli_link' href='XXXXXXXXX' ><img`` anchors
    whose ``href`` is exactly nine characters long and appends
    ``base_url + href`` to *ilt* for each one, in page order.

    Args:
        ilt: List that receives the album URLs (mutated in place).
        html: Listing page source; ``None`` or ``""`` yields no links.
        base_url: Site root prepended to each relative ``href``.
            NOTE(review): the literal was missing from the published
            listing; the default is a reconstruction - confirm before use.

    Returns:
        None.  *ilt* is left unchanged when the page has no album links.
    """
    if not html:
        return

    # Either quote style is accepted around attribute values.
    hrefs = re.findall(
        r"<a class=['\"]galleryli_link['\"] href=['\"](.{9})['\"]\s*>\s*<img",
        html,
    )
    ilt.extend(base_url + href for href in hrefs)
def xiewenjian(lit):
    """Download every URL in *lit* and save it as ``<index>.jpg``.

    Files are written to the ``D:\\mv\\`` folder and named after the
    position of the URL in *lit* (``0.jpg``, ``1.jpg``, ...).  A failed
    download or write is reported with ``连接失败`` and skipped, so its
    index is simply missing from the output folder.

    Args:
        lit: Iterable of absolute image URLs.

    Raises:
        OSError: If the output folder does not exist and cannot be created.
    """
    print(123)
    # NOTE(review): the path separators were missing from the published
    # listing; "D:\\mv\\" is a reconstruction - confirm.
    path = "D:\\mv\\"
    # Create the target folder up front instead of requiring it to be made
    # by hand before the script is run.
    os.makedirs(path, exist_ok=True)

    for count, url in enumerate(lit):
        path1 = path + str(count) + ".jpg"
        print(path1)
        try:
            r = requests.get(url, timeout=30)
            # Do not save an HTTP error page under a .jpg name.
            r.raise_for_status()
            with open(path1, "wb") as f:
                f.write(r.content)
        except (requests.RequestException, OSError):
            print("连接失败")
            continue
if __name__ == "__main__":
    # Script entry point: collect album links from every country listing,
    # expand each album into image URLs, then download all the images.

    # NOTE(review): the site root literal was missing from the published
    # listing; this value is a reconstruction - confirm before running.
    url = "https://www.nvshens.com"
    url1 = [
        "/gallery/meiguo/", "/gallery/eluosi/", "/gallery/wukelan/",
        "/gallery/yingguo/", "/gallery/faguo/", "/gallery/deguo/",
        "/gallery/yidali/", "/gallery/xibanya/", "/gallery/helan/",
        "/gallery/jieke/", "/gallery/keluodiya/", "/gallery/danmai/",
        "/gallery/tuerqi/", "/gallery/ruidian/", "/gallery/putaoya/",
        "/gallery/xila/", "/gallery/yazhou/", "/gallery/rihan/",
        "/gallery/yilang/", "/gallery/aierlan/", "/gallery/nuowei/",
        "/gallery/neidi/", "/gallery/taiwan/", "/gallery/xianggang/",
        "/gallery/aomen/", "/gallery/riben/", "/gallery/hanguo/",
        "/gallery/malaixiya/", "/gallery/yuenan/", "/gallery/taiguo/",
        "/gallery/feilvbin/", "/gallery/hunxue/", "/gallery/oumei/",
        "/gallery/yindu/", "/gallery/feizhou/",
    ]
    houzhui = ".html"
    zonglist = []  # album page URLs
    lit = []       # image URLs

    for i in url1:
        # Pages 1 through 20 inclusive; page 1 has no numeric suffix.
        for j in range(1, 21):
            if j == 1:
                zurl = url + i
            else:
                zurl = url + i + str(j) + houzhui
            found_before = len(zonglist)
            try:
                html = getHTMLText(zurl)
                if not html:
                    # Fetch failed; this says nothing about later pages.
                    continue
                parsePage1(zonglist, html)
            except Exception:
                print("遇到错误")
                continue
            if len(zonglist) == found_before:
                # A page without album links means the category has ended.
                break

    for i in zonglist:
        try:
            html = getHTMLText(i)
            parsePage(lit, html)
        except Exception:
            print("遇到错误3")
            continue

    print(len(lit))
    xiewenjian(lit)
如有不足,请多多指教。
|Python爬取宅男女神网美女照片
gallery Python python for循环 Python爬取宅男女神网美女照片