博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
百度图片爬虫-python版
阅读量:6892 次
发布时间:2019-06-27

本文共 12760 字,大约阅读时间需要 42 分钟。

  1 
# coding:utf-8
"""Baidu image crawler (Python 2).

Created on 2015-9-17

@author: huangxie
"""
import cookielib
import json
import math
import os
import re
import sys
import threading
import time
import urllib
import urllib2
import uuid
from Queue import Queue
from threading import Thread

import MySQLdb as mdb
from bs4 import BeautifulSoup
from MySQLdb.constants.REFRESH import STATUS

import imitate_browser
import utils

# Python 2 only: make utf-8 the implicit str<->unicode codec, so the Chinese
# byte-string literals in this file can be mixed with unicode values.
reload(sys)
sys.setdefaultencoding('utf-8')

# MySQL connection settings.
# NOTE(review): credentials are hard-coded; consider loading them from config.
DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'

# Proxy used for outgoing requests; get_pic() rebinds this global when a
# request fails and a replacement proxy is available.
proxy = {u'http': u'222.39.64.13:8118'}

# Image-search JSON endpoint: word = keyword, pn = result offset, rn = page size.
TOP_URL = "http://image.baidu.com/i?tn=resultjsonavatarnew&ie=utf-8&word={word}&pn={pn}&rn={rn}"
# Web-search page whose "related searches" table is mined for new keywords.
KEYWORD_URL = "https://www.baidu.com/s?ie=utf-8&f=8&tn=baidu&wd={wd}"

# Request headers sent with every urllib2 request.
# NOTE: an earlier, disabled variant also sent Accept / Accept-Charset /
# Accept-Encoding / Connection / Referer headers; its comment advised setting
# Referer to the target site's host if crawling still fails.
i_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
 80 
 81  
 82 
 83 
def GetDateString():
    """Return today's local date as ``YEAR-MONTH-DAY`` without zero padding.

    Example: ``2015-9-17``. The result is used as a per-day folder name.
    """
    today = time.localtime()
    # Plain attribute access replaces the reflective __getattribute__ calls;
    # %d keeps the original unpadded month/day format.
    return "%d-%d-%d" % (today.tm_year, today.tm_mon, today.tm_mday)
 90 
 91  
 92 
 93 
class BaiduImage(threading.Thread):
    """Multi-threaded crawler that downloads Baidu image-search results.

    Keywords and result pages to crawl live in the MySQL table ``info``;
    downloaded images are written under ``E://sosogif`` and described in the
    table ``pic_info``; replacement proxies are read from the table ``proxy``.

    The class derives from ``threading.Thread`` but does not override
    ``run()``; the worker threads are created by :meth:`start_work` and each
    one executes :meth:`work`.

    All workers share one DB connection/cursor, so database access is
    serialised with the re-entrant lock ``self.mutex``.
    """

    def __init__(self):
        """Set up counters, the request queue and the shared MySQL connection."""
        threading.Thread.__init__(self)
        self.browser = imitate_browser.BrowserBase()
        # Failure counters: `chance` counts failed result-page requests,
        # `chance1` counts failed image downloads. They drive proxy rotation.
        self.chance = 0
        self.chance1 = 0
        # (id, word, page_num) tuples waiting to be fetched by get_pic().
        self.request_queue = Queue()
        self.wait_ana_queue = Queue()
        # Number of images handled so far; used to spread files over folders.
        self.count = 0
        # Re-entrant lock: the owning thread may acquire it again.
        self.mutex = threading.RLock()
        # Inserts into pic_info since the last commit.
        self.commit_count = 0
        # ID of the next row to read from the `proxy` table.
        self.ID = 500
        # Candidate (protocol, "ip:port") proxies not yet tried.
        self.next_proxy_set = set()
        self.dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'sosogif', charset='utf8')
        self.dbconn.autocommit(False)
        self.dbcurr = self.dbconn.cursor()
        self.dbcurr.execute('SET NAMES utf8')

    def work(self, item):
        """Worker loop: alternately crawl one queued page and queue more work.

        :param item: index of the worker thread, only used for logging.
        """
        print("start thread %s" % item)
        while True:
            self.get_pic()
            self.prepare_request()

    def format_keyword_url(self, keyword):
        """Return the utf-8 encoded web-search URL for ``keyword``."""
        return KEYWORD_URL.format(wd=keyword).encode('utf-8')

    def generateSeed(self, url):
        """Harvest related keywords from a search page into the ``info`` table.

        Fetches ``url``, walks the ``<th>`` cells of the table inside
        ``<div id="rs">`` and inserts every link text containing "动态图" or
        "gif" that is not yet present in ``info``. Best effort: any failure
        is logged and swallowed.

        NOTE: touches the shared cursor without locking; the only caller in
        this class (prepare_request) already holds ``self.mutex``.
        """
        try:
            html = self.browser.openurl(url).read()
            if not html:
                return
            soup = BeautifulSoup(html)
            # All rows of the related-searches table.
            rows = soup.find('div', id='rs').find('table').find_all('tr')
            for tr in rows:
                for th in tr.find_all('th'):
                    keyword = th.find_all('a')[0].text.strip()
                    if "动态图" in keyword or "gif" in keyword:
                        print("keyword %s" % keyword)
                        self.dbcurr.execute('select id from info where word=%s', (keyword,))
                        if not self.dbcurr.fetchone():
                            self.dbcurr.execute('INSERT INTO info(word,status,page_num,left_num,how_many) VALUES(%s,0,0,0,0)', (keyword,))
                self.dbconn.commit()
        except Exception as err:
            # Seeding is optional; report the problem instead of hiding it.
            print("generateSeed error %s" % err)

    def prepare_request(self):
        """Queue one unvisited ``info`` row and expand fresh keywords.

        Takes one row with ``status=0`` and puts ``(id, word, page_num)`` on
        the request queue. If the row is a fresh keyword (page_num, left_num
        and how_many all 0) it additionally harvests related keywords,
        asks the image endpoint how many results exist, inserts one ``info``
        row per result page to crawl and marks the keyword row as visited.

        The lock is released in ``finally`` so that an error cannot leave it
        held and block the other workers.
        """
        self.lock()
        try:
            self.dbcurr.execute('select * from info where status=0')
            result = self.dbcurr.fetchone()
            if not result:
                return
            row_id, word, status, page_num, left_num, how_many = result
            self.request_queue.put((row_id, word, page_num))
            if not (page_num == 0 and left_num == 0 and how_many == 0):
                return

            self.generateSeed(self.format_keyword_url(word))
            html = ""
            try:
                url = self.format_top_url(word, page_num, 24)
                html = self.browser.openurl(url).read()
            except Exception as err:
                print("err %s" % err)
            if html != "":
                how_many = self.how_many(html)
                print("how_many %s" % how_many)
                if how_many is None:
                    how_many = 0
                # Only the first 1/100 of the result pages (24 per page) is
                # wanted. The previous expression `how_many/24*100` multiplied
                # by 100 instead of dividing.
                num = int(math.ceil(how_many / (24 * 100.0)))
                for i in xrange(0, num - 1):
                    self.dbcurr.execute('INSERT INTO info(word,status,page_num,left_num,how_many) VALUES(%s,%s,%s,%s,%s)', (word, 0, i * 24, num - i, how_many))
                # Mark the keyword row as visited.
                self.dbcurr.execute('update info SET status=1 WHERE id=%s', (row_id,))
                self.dbconn.commit()
        finally:
            self.unlock()

    def start_work(self, req_max):
        """Start ``req_max`` daemon threads, each running :meth:`work`."""
        for item in xrange(req_max):
            t = threading.Thread(target=self.work, args=(item,))
            t.daemon = True
            t.start()

    def lock(self):
        """Acquire the shared re-entrant lock."""
        self.mutex.acquire()

    def unlock(self):
        """Release the shared re-entrant lock."""
        self.mutex.release()

    def get_para(self, url, key):
        """Return the value of query parameter ``key`` in ``url``.

        :returns: the text after the first ``=`` of the first matching
            parameter (``""`` when the parameter has no ``=``), or ``None``
            when the parameter is absent.
        """
        query = url.split('?')[-1]
        for pair in query.split('&'):
            name, _, value = pair.partition('=')
            if name == key:
                return value
        return None

    def makeDateFolder(self, par, child):
        """Ensure ``par//<date>//child`` exists and return that path.

        ``<date>`` is the value of GetDateString(). When ``par`` is not an
        existing directory nothing is created and ``par`` itself is returned.
        """
        if not os.path.isdir(par):
            return par
        path = par + '//' + GetDateString()
        newFolderName = path + '//' + child
        for folder in (path, newFolderName):
            if not os.path.isdir(folder):
                os.mkdir(folder)
        return newFolderName

    def parse_json(self, data):
        """Download every image listed in one result page.

        :param data: JSON text of a result page; its ``imgs`` list holds one
            dict per image with at least an ``objURL`` key.
        :raises ValueError: when ``data`` is not valid JSON (decoding happens
            outside the error handling; get_pic() counts it as a failed
            request).

        An ``IOError`` while handling one image increments ``self.chance1``
        and moves on to the next image; any other error is logged and ends
        the page.
        """
        ipdata = json.loads(data)
        try:
            for n in ipdata['imgs'] or []:
                if not n['objURL']:
                    continue
                try:
                    proxy_support = urllib2.ProxyHandler(proxy)
                    opener = urllib2.build_opener(proxy_support)
                    urllib2.install_opener(opener)
                    # NOTE(review): the download runs while the lock is held,
                    # which serialises the workers; kept because self.count
                    # and the shared cursor are only protected by this lock.
                    self.lock()
                    try:
                        self._fetch_and_store(n)
                    finally:
                        # Always release, otherwise a failed download leaves
                        # the lock held and every other worker blocks.
                        self.unlock()
                except IOError:
                    self.chance1 += 1
        except Exception as parse_error:
            print("parse_error %s" % parse_error)

    def _fetch_and_store(self, n):
        """Download one image unless its URL is already in ``pic_info``.

        Must be called with ``self.mutex`` held.
        """
        self.dbcurr.execute('select ID from pic_info where objURL=%s', (n['objURL'],))
        if self.dbcurr.fetchone():
            print("database exist")
            return

        real_extension = utils.get_extension(n['objURL'])
        req = urllib2.Request(n['objURL'], headers=i_headers)
        resp = urllib2.urlopen(req, None, 5)
        try:
            dataimg = resp.read()
        finally:
            resp.close()

        name = str(uuid.uuid1())
        # Extensions longer than 4 characters are replaced by ".gif".
        if len(real_extension) > 4:
            real_extension = ".gif"
        real_extension = real_extension.lower()
        # GIFs are spread over 60 "d*" folders, everything else over 20 "o*".
        if real_extension == ".gif":
            bucket = "d" + str(self.count % 60)
        else:
            bucket = "o" + str(self.count % 20)
        filename = self.makeDateFolder("E://sosogif", bucket) + "//" + name + "-www.sosogif.com-搜搜gif贡献" + real_extension
        self.count += 1

        try:
            if not os.path.exists(filename):
                with open(filename, 'w+b') as file_object:
                    file_object.write(dataimg)
                # Record the image in the database.
                self.anaylis_info(n, filename, real_extension)
            else:
                print("file exist")
        except IOError as e1:
            print("e1= %s" % e1)

    def title_dealwith(self, title):
        """Return ``title`` without ``<strong>``/``</strong>`` tags, stripped.

        Titles without any tag are returned unchanged apart from stripping.
        """
        return title.replace("<strong>", "").replace("</strong>", "").strip()

    def anaylis_info(self, n, filename, real_extension):
        """Insert one downloaded image into ``pic_info`` unless ``cs`` exists.

        :param n: image dict from the result JSON.
        :param filename: path the image was saved to.
        :param real_extension: extension the file was saved with.

        Commits after every 10 inserts. Uses the shared cursor without
        locking; the only caller in this class holds ``self.mutex``.
        """
        print("success.")
        objURL = n['objURL']            # image address
        fromURLHost = n['fromURLHost']  # source site
        width = n['width']
        height = n['height']
        di = n['di']                    # described upstream as a unique identifier
        img_type = n['type']            # image format
        keyword = self.title_dealwith(n['fromPageTitle'])  # title of the source page
        cs = n['cs']                    # meaning unknown (upstream field)
        os_field = n['os']              # meaning unknown (upstream field)
        acTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # crawl time

        self.dbcurr.execute('select ID from pic_info where cs=%s', (cs,))
        if self.dbcurr.fetchone():
            return
        print("add pic %s" % filename)
        self.commit_count += 1
        self.dbcurr.execute('INSERT INTO pic_info(objURL,fromURLHost,width,height,di,type,keyword,cs,os,acTime,filename,real_extension) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)', (objURL, fromURLHost, width, height, di, img_type, keyword, cs, os_field, acTime, filename, real_extension))
        if self.commit_count >= 10:
            self.dbconn.commit()
            self.commit_count = 0

    def format_top_url(self, word, pn, rn):
        """Return the utf-8 encoded image-search URL.

        :param word: search keyword.
        :param pn: result offset.
        :param rn: number of results per page.
        """
        return TOP_URL.format(word=word, pn=pn, rn=rn).encode('utf-8')

    def how_many(self, data):
        """Return the ``displayNum`` reported in a result page.

        :returns: ``int(displayNum)`` when it is positive, ``0`` when it is
            not, and ``None`` when ``data`` cannot be decoded or lacks a
            usable ``displayNum``.
        """
        try:
            ipdata = json.loads(data)
            if ipdata['displayNum'] > 0:
                return int(ipdata['displayNum'])
            return 0
        except (ValueError, KeyError, TypeError):
            return None

    def _load_next_proxies(self):
        """Read the proxy row ``self.ID`` into ``next_proxy_set``.

        Resets both failure counters and advances ``self.ID``; every 100th
        ID the counter wraps to 50 once it exceeds the row count of the
        ``proxy`` table. Must be called with ``self.mutex`` held.
        """
        if self.ID % 100 == 0:
            self.dbcurr.execute("select count(*) from proxy")
            count = self.dbcurr.fetchone()[0]
            if self.ID > count:
                self.ID = 50
        self.dbcurr.execute("select * from proxy where ID=%s", (self.ID,))
        for r in self.dbcurr.fetchall():
            # Columns 1..3 are read as protocol, ip and port.
            self.next_proxy_set.add((r[1], "%s:%s" % (r[2], r[3])))
        self.chance = 0
        self.chance1 = 0
        self.ID += 1

    def get_pic(self):
        """Fetch one queued result page and download its images.

        Takes one ``(id, word, page_num)`` request (returns immediately when
        the queue is empty), marks the ``info`` row as visited, loads
        replacement proxies if earlier requests failed, then requests the
        page through the current global ``proxy`` and hands the JSON to
        :meth:`parse_json`. A failed request increments ``self.chance`` and
        switches the global ``proxy`` to the next candidate, if any.
        """
        global proxy
        # Local import: at module level the name `Queue` is bound to the
        # class, so the module's Empty exception is not otherwise reachable.
        from Queue import Empty as QueueEmpty

        try:
            print("size of queue %s" % self.request_queue.qsize())
            try:
                # Non-blocking: checking qsize() and then calling a blocking
                # get() races with the other workers.
                row_id, word, page_num = self.request_queue.get_nowait()
            except QueueEmpty:
                return
            u = self.format_top_url(word, page_num, 24)

            self.lock()
            try:
                self.dbcurr.execute('update info SET status=1 WHERE id=%s', (row_id,))
                self.dbconn.commit()
                # Any recorded failure triggers loading further proxies.
                if self.chance > 0 or self.chance1 > 1:
                    self._load_next_proxies()
            finally:
                self.unlock()

            proxy_support = urllib2.ProxyHandler(proxy)
            opener = urllib2.build_opener(proxy_support)
            urllib2.install_opener(opener)
            try:
                req = urllib2.Request(u, headers=i_headers)
                response = urllib2.urlopen(req, None, 5)
                html = response.read()
                if html:
                    self.parse_json(html)
            except Exception:
                self.chance += 1
                if self.next_proxy_set:
                    protocol, address = self.next_proxy_set.pop()
                    proxy = {protocol: address}
                    print("change proxy finished<< %s %s" % (proxy, self.ID))
        except Exception as e:
            print("error1 %s" % e)
748 
749             
750 
751 
if __name__ == '__main__':
    # Script entry point: start 80 crawler threads.
    app = BaiduImage()
    app.start_work(80)
    # The workers are daemon threads, so the main thread must stay alive.
    # Sleep instead of spinning on `pass`, which would burn a CPU core.
    while True:
        time.sleep(1)

 

 

 

 

 

 

 

 

若转载请注明出处!若有疑问,请回复交流!
你可能感兴趣的文章
Nginx内置变量以及日志格式变量参数详解
查看>>
python基础-RE正则表达式
查看>>
Linux下查看Nginx安装目录、版本号信息?
查看>>
MVC ---- 标准查询运算符
查看>>
巧用“搜索”解决自学编程遇到的难题
查看>>
移动端布局解决方案+神器
查看>>
pom.xml详解
查看>>
最全的maven的pom.xml文件详解
查看>>
android图片特效处理之怀旧效果
查看>>
微信门店小程序怎样创建 门店小程序创建方法简介
查看>>
深度优先搜索、广度优先搜索
查看>>
代码运行时间 检测锁及死锁详细信息,及sql语句 平台转化
查看>>
Linux简单了解
查看>>
Android中3种全屏方法及3种去掉标题栏的方法
查看>>
使用JAVA开发微信公众平台(一)——环境搭建与开发接入
查看>>
es6 解构赋值
查看>>
SVN导入maven项目
查看>>
怎样彻底解决支付系统资金账户高并发引起的表死锁问题?
查看>>
sharpkeys键盘按键重映射
查看>>
使用spring-boot-maven-plugin打包
查看>>