想学习用 Python 写网络爬虫,有专门介绍这方面的书籍吗?

如题所述

第1个回答  2016-03-24
这方面的书好像没有吧,我是参考网上的博客,自己写了一小段代码……
代码如下:
# -*- coding:utf-8 -*-#
#-python 2.7-#
import shutil
import urllib
import urllib2
import re,os,glob,string,sys
#=====================================================#
# MAIN #
#=====================================================#
def main():
    """Crawl every listing category and save its text posts under ./DUANZI.

    Resets the global file counter ``j`` to 1, recreates an empty ``DUANZI``
    directory inside the current working directory, then walks each listing
    path in ``url_lists`` through ``ALL_CON``.
    """
    # Python 2 only: make utf-8 the process-wide default codec for implicit
    # str/unicode conversion.  ``reload`` and ``sys.setdefaultencoding`` do
    # not exist on Python 3, hence the version guard.
    if sys.version_info[0] == 2:
        reload(sys)
        sys.setdefaultencoding("utf-8")

    global j  # number used as the name of the next '.txt' file
    j = 1

    url_home = 'http://www.qiushibaike.com'
    url_lists = ['/8hr/page/', '/hot/page/', '/history/page/']

    # Build the path with os.path.join rather than a hard-coded '\DUANZI':
    # the backslash form only names a sub-directory on Windows, so elsewhere
    # the existence check never matched the directory created by mkdir.
    out_dir = os.path.join(os.getcwd(), 'DUANZI')
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)  # start every run from an empty directory
    os.mkdir(out_dir)

    for url_list in url_lists:
        ALL_CON(url_list, url_home)
#=====================================================#
# GET URL #
#=====================================================#
def ALL_CON(url1, url2):
    """Walk the numbered pages of one listing and save every new page.

    Pages are requested as ``url2 + url1 + str(n)`` for n = 1, 2, 3, ...
    The text returned by ``GetPage`` for each page is handed to ``DuanZi``.
    The walk stops at the first page whose text was already seen.

    Args:
        url1: listing path ending in the page prefix, e.g. '/hot/page/'.
        url2: site root, e.g. 'http://www.qiushibaike.com'.
    """
    seen_pages = set()
    page_no = 1
    while True:
        # print(page_no)  # uncomment to trace progress while debugging
        page_text = GetPage(url2 + url1 + str(page_no))
        if page_text in seen_pages:
            # A repeated page ends the walk (presumably the site keeps
            # serving the same content past the last page -- confirm).
            # Checking before saving keeps the repeat from being written
            # to disk a second time.
            break
        seen_pages.add(page_text)
        DuanZi(page_text)
        page_no += 1
#======================================================#
# GET HTML CODE #
#======================================================#
def GetPage(url):
    """Download *url* and return the markup of its text-only posts.

    The page is fetched with a browser-like User-Agent, decoded as utf-8 and
    passed to ``_extract_text_posts``.

    Args:
        url: address of the listing page to fetch.

    Returns:
        A single string: the whitespace-stripped markup of every post that
        contains no image, concatenated in page order ('' if none match).

    Raises:
        Whatever ``urlopen`` raises on network/HTTP failure, and
        UnicodeDecodeError if the body is not valid utf-8.
    """
    # Local import so the function works on both Python 2 (urllib2) and
    # Python 3 (urllib.request exposes the same Request/urlopen API).
    try:
        import urllib2
    except ImportError:
        import urllib.request as urllib2

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2652.2 Safari/537.36'}
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req)
    try:
        html = response.read().decode('utf-8')
    finally:
        response.close()  # release the connection even if decoding fails
    return _extract_text_posts(html)


def _extract_text_posts(html):
    """Return the concatenated markup of the posts in *html* without an image."""
    # Strip spaces, tabs and newlines first.  The patterns below are written
    # without spaces ('<divclass=', '<imgsrc='), so they can only match once
    # spaces are gone; the original no-op re.sub('', '', ...) is assumed to
    # have lost its single-space pattern.
    compact = re.sub(r'[ \t\n]', '', html)
    # Each match is one post: class="article block untagged mb15" up to the
    # following class="single-clear" div.
    posts = re.findall('<divclass="articleblockuntaggedmb15".*?<divclass="single-clear">', compact)
    # Filter instead of deleting by index while scanning: the index-based
    # delete skipped the element after every removal, letting consecutive
    # image posts through.
    return ''.join(post for post in posts if '<imgsrc=' not in post)
#===================================================#
# WRITE IN TXT FILE #
#===================================================#
def DuanZi(list1):
    """Write every post body found in *list1* to its own numbered text file.

    Each ``<divclass="content">...</div>`` fragment in *list1* has its tags
    and double quotes removed and is saved as ``<j>.txt`` inside the
    ``DUANZI`` directory of the current working directory.  The global
    counter ``j`` is incremented after every file, so names keep counting
    up across calls.

    Args:
        list1: whitespace-stripped page markup, as returned by ``GetPage``.

    Raises:
        IOError/OSError if the ``DUANZI`` directory does not exist.
    """
    global j
    import io  # local import: io.open takes an encoding on Python 2 and 3

    # Write through a joined path rather than chdir-ing into '\DUANZI':
    # that form is Windows-only, and a failed write used to leave the
    # process stranded in the wrong working directory.
    out_dir = os.path.join(os.getcwd(), 'DUANZI')
    content_re = re.compile('<divclass="content">.*?</div>')
    for fragment in content_re.findall(list1):
        # Drop the tags, then the double quotes, keeping only the text.
        text = re.sub('<.*?>', '', fragment).replace('"', '')
        if isinstance(text, bytes):
            # Python 2 byte string: io.open's text mode needs unicode.
            text = text.decode('utf-8')
        # Explicit utf-8 so the output does not depend on the interpreter's
        # default encoding; 'with' closes the file even if the write fails.
        with io.open(os.path.join(out_dir, str(j) + '.txt'), 'w', encoding='utf-8') as fp:
            fp.write(text)
        j += 1
#============RUN===========================#
# Start the crawl only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
上面这些代码,你只要改一下 User-Agent 的值,应该就可以运行了。
本回答被提问者采纳
第2个回答  推荐于2017-08-16
八爪鱼采集器是任何一个需要从网页获取信息的人都必备的采集工具,它彻底改变了我对爬虫和采集器的认识,让网页数据采集变得前所未有的简单。如果你正在寻找一款好用的采集软件,八爪鱼绝对是最好的选择。
本回答被网友采纳
第3个回答  2017-08-15
我都是在百度上搜索的!
书还是看python基本知识吧
相似回答