1. <em id="yud1w"><acronym id="yud1w"><u id="yud1w"></u></acronym></em>
      
      
      <button id="yud1w"></button>

      python

      当前位置:首页 > SEO工具 > 当前文章

      SEO工具

      python采集百度PC搜索结果页真实url

      2020-08-24 114赞 python中国网
      每篇文章致力于解决一个问题!python高级、python面试全套、操作系统经典课等可移步文章底部。

        1000个读者有1000个哈姆雷特,1000个seo有1000种需求,采集百度搜索结果页的真实url就是一项。很多SEO工具实现的过程也需要采集真实url这一步,下面的代码就是采集百度PC排名的真实url。

        1、准备关键词文件kwd.txt(一行一个关键词)。

        2、结果会保存成bdpc_real_url.txt。

        3、线程数默认是1,现在百度反爬比之前严重!线程最好是1。【多线程写同一个文件需要加锁否则可能数据错乱】

      # -*- coding: utf-8 -*-
      """
      采集百度pc首页排名的真实url
      准备kwd.txt,一行一个词
      线程数自己设,默认1
      """
      import requests
      from pyquery import PyQuery as pq
      import threading
      import queue
      import time
      import gc
      
      
      class BdpcRealUrl(threading.Thread):
          """Worker thread that resolves Baidu PC result links to their real URLs.

          Each worker repeatedly takes a keyword from the module-level queue ``q``,
          downloads the Baidu result page using the module-level headers
          ``my_header``, resolves every encrypted ``www.baidu.com/link?url=``
          redirect found on the page and appends the resolved URLs, one per line,
          to the module-level output file ``f``.
          """

          # Shared by all workers so that lines written by different threads to
          # the common output file never interleave.
          _write_lock = threading.Lock()

          def __init__(self):
              super().__init__()

          @staticmethod
          def read_txt(filepath):
              """Load keywords from a UTF-8 text file into a queue.

              Args:
                  filepath: Path of a text file holding one keyword per line.

              Returns:
                  A ``queue.Queue`` containing every non-blank line, stripped of
                  surrounding whitespace, in file order.
              """
              kwd_queue = queue.Queue()
              # ``with`` guarantees the handle is closed even if reading fails.
              with open(filepath, encoding='utf-8') as kwd_file:
                  for line in kwd_file:
                      kwd = line.strip()
                      if kwd:  # a blank line would become an empty search
                          kwd_queue.put(kwd)
              return kwd_queue

          def get_html(self, url, retry=2):
              """Download the result page for ``url``.

              Args:
                  url: Search URL to request.
                  retry: Number of additional attempts after a failed request.

              Returns:
                  A ``(html, final_url)`` tuple, where ``final_url`` is the address
                  after any redirect, or ``None`` when every attempt failed.
              """
              try:
                  r = requests.get(url=url, headers=my_header, timeout=5)
              except Exception as e:
                  print('获取源码失败', e)
                  time.sleep(6)
                  if retry > 0:
                      # Propagate the retry's result instead of dropping it.
                      return self.get_html(url, retry - 1)
                  return None
              # Decode manually: r.text sometimes guesses the wrong encoding.
              html = r.content.decode('utf-8', errors='ignore')
              # Anti-crawl responses redirect, so report the final address.
              return html, r.url

          def get_encrpt_urls(self, html, url):
              """Extract the encrypted result links from a result page.

              Args:
                  html: Page source returned by ``get_html``.
                  url: Final URL the page was served from.

              Returns:
                  A list of ``http://www.baidu.com/link?url=...`` hrefs. The list
                  is empty when the page does not look like a normal result page,
                  in which case the thread also sleeps for 100 seconds.
              """
              encrypt_url_list = []
              doc = pq(html)
              title = doc('title').text()
              if '_百度搜索' in title and 'https://www.baidu.com/s?ie=utf-8' in url:
                  try:
                      a_list = doc('.t a').items()
                  except Exception as e:
                      print('未提取到serp上的解密url', e)
                  else:
                      for a in a_list:
                          encrypt_url = a.attr('href')
                          # An <a> without href yields None; skip it rather than
                          # failing the whole keyword.
                          if encrypt_url and encrypt_url.startswith('http://www.baidu.com/link?url='):
                              encrypt_url_list.append(encrypt_url)
              else:
                  print(title, '源码异常,可能反爬')
                  time.sleep(100)
              return encrypt_url_list

          def decrypt_url(self, encrypt_url, retry=1):
              """Resolve one encrypted Baidu link to the URL it redirects to.

              Args:
                  encrypt_url: A ``baidu.com/link?url=`` address.
                  retry: Number of additional attempts after a failed request.

              Returns:
                  The value of the response's ``Location`` header, or ``None`` when
                  the request kept failing or the header is absent.
              """
              encrypt_url = encrypt_url.replace('http://', 'https://')
              try:
                  # HEAD does not follow redirects by default, so the target stays
                  # available in the Location header. The timeout keeps a stalled
                  # connection from blocking the worker forever.
                  r = requests.head(encrypt_url, headers=my_header, timeout=5)
              except Exception as e:
                  print(encrypt_url, '解密失败', e)
                  time.sleep(6)
                  if retry > 0:
                      # Propagate the retry's result instead of dropping it.
                      return self.decrypt_url(encrypt_url, retry - 1)
                  return None
              return r.headers.get('Location')

          def get_real_urls(self, encrypt_url_list):
              """Resolve every encrypted link in ``encrypt_url_list``.

              Args:
                  encrypt_url_list: Encrypted links from ``get_encrpt_urls``.

              Returns:
                  A list with one entry per input link (``None`` for links that
                  could not be resolved); an empty list for empty input.
              """
              if not encrypt_url_list:
                  return []
              return [self.decrypt_url(encrypt_url) for encrypt_url in encrypt_url_list]

          def run(self):
              """Process keywords from ``q`` until the process exits."""
              while True:
                  kwd = q.get()
                  # The URL carries tn and similar parameters; without them the
                  # request gets blocked by anti-crawl.
                  url = "https://www.baidu.com/s?ie=utf-8&rsv_bp=1&tn=87048150_dg&wd={0}".format(kwd)
                  try:
                      fetched = self.get_html(url)
                      if fetched is None:
                          # get_html already reported the failure.
                          continue
                      html, now_url = fetched
                      encrypt_url_list = self.get_encrpt_urls(html, now_url)
                      # Drop unresolved links so writing cannot fail on None.
                      real_urls = [u for u in self.get_real_urls(encrypt_url_list) if u]
                      with self._write_lock:
                          for real_url in real_urls:
                              f.write(real_url + '\n')
                              print(real_url)
                          f.flush()
                  except Exception as e:
                      # Keep the worker alive: a dead worker would leave queued
                      # keywords unprocessed and q.join() waiting forever.
                      print(e)
                  finally:
                      q.task_done()
      
      
      if __name__ == "__main__":

          start = time.time()
          # Request headers shared by every worker thread.
          # NOTE(review): the Cookie below is hard-coded and includes a BDUSS
          # value, which looks like a personal session credential - consider
          # loading it from the environment or a local config file instead of
          # keeping it in source.
          my_header = {
              'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
              'Cookie':'BIDUPSID=EB1F44AB7896D7EFA4F0FD243C29FF17; PSTM=1567562976; BAIDUID=EB1F44AB7896D7EFA4F0FD243C29FF17:SL=0:NR=10:FG=1; BDUSS=BZWlZuSXpNWmNjM3BTSktnM2xhbGhIdUlqeW1ITEdvclpzSHpIS3p2WUMwc2hkRVFBQUFBJCQAAAAAAAAAAAEAAAAGtiZkNzcyNDgzMjAwZG9uZwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAJFoV0CRaFdeF; plus_cv=1::m:49a3f4a6; MSA_WH=400_655; lsv=globalTjs_3a11c3d-globalT_androidcss_4630b37-wwwT_androidcss_c5f9a54-searchboxcss_591d86b-globalBcss_aad48cc-wwwBcss_777000e-framejs_c9ac861-atomentryjs_5cd4b30-globalBjs_99ad350-wwwjs_b674808; BD_UPN=19314353; BDORZ=FFFB88E999055A3F8A630C64834BD6D0; BDICON=10294984.98; delPer=0; BD_CK_SAM=1; rsv_i=c2b6G%2F3avQC%2FfgLjK6Tg5dByzXJGjTHszykjx0XgYlZZgizi3%2F9wOVrzCucTWKLxPYYUs%2BqPpygizpeQMUWhVScLKRxzaaw; FEED_SIDS=732051_1030_14; plus_lsv=f197ee21ffd230fd; Hm_lvt_12423ecbc0e2ca965d84259063d35238=1572225355,1572415847,1572418912; Hm_lpvt_12423ecbc0e2ca965d84259063d35238=1572418912; BAIDULOC=12966109.384666294_4841881.341700486_100_131_1572418911981; SE_LAUNCH=5%3A26206981_0%3A26206981; BDPASSGATE=IlPT2AEptyoA_yiU4VKH3kIN8efjWvW4AfvESkplQFStfCaWmhH3BrUzWz0HSieXBDP6wZTXdMsDxXTqXlVXa_EqnBsZolpOaSaXzKGoucHtVM69-t5yILXoHUE2sA8PbRhL-3MEF2ZELlQvcgjchQZrchW8z3JTpxz1z5Xocc0T1UKR2VLJxJyTS7xvRHvcPNuz94rXnEpKKSmBUADHRVjYcSQyWXkD5NOtjsAm1Q0WrkoXGurSRvAa1G8vJpFeXAio1fWU60ul269v5HViViwh9UOI7u46MnJZ; H_WISE_SIDS=137151_137734_137755_136649_137663_137071_128070_134982_136665_120196_136768_137002_137788_136366_132909_136456_137690_135847_131246_137746_132378_136681_118893_118876_118846_118827_118802_132782_136800_136431_136093_133352_136862_137089_129652_136194_124637_137105_137572_133847_132551_137468_134046_129646_131423_137212_137466_136034_110085_127969_137613_131951_136611_137252_128196_137696_136636_137767_137207_134347_134231_137618_137449; kleck=638cabc3ad33a7a082343c4553a47c42; BDRCVFR[x4e6higC8W6]=mk3SLVN4HKm; PSINO=7; H_PS_PSSID=1440_21084_20697_29567_29220; sug=3; sugstore=0; ORIGIN=0; bdime=0; H_PS_645EC=db34IWhem1lYO7OwXVBPbsx2yQuIu3jmqGT9FUp09TItjsTj8omDTLnov6%2BIZQe6dqc',
              'Host':'www.baidu.com',
              'Upgrade-Insecure-Requests':'1'}
          # Keyword queue consumed by the worker threads.
          q = BdpcRealUrl.read_txt('kwd.txt')
          # Number of worker threads.
          thread_count = 1
          # The context manager closes the output file even if start-up fails.
          with open('bdpc_real_url.txt', 'w+', encoding='utf-8') as f:
              for _ in range(thread_count):
                  t = BdpcRealUrl()
                  # Workers loop forever, so they must be daemon threads or the
                  # process would never exit.
                  t.daemon = True
                  t.start()
              # Wait until every queued keyword has been marked done.
              q.join()
              f.flush()
          end = time.time()
          print('耗时{0}min'.format((end - start) / 60))
      
      https://baike.baidu.com/item/%E6%90%9C%E7%B4%A2%E5%BC%95%E6%93%8E%E4%BC%98%E5%8C%96/3132?fromtitle=seo&fromid=102990&fr=aladdin
      https://www.seoqe.com/
      http://www.shangpaiming.com/
      https://www.xminseo.com/
      http://www.sshz.com/
      https://www.seodt.com/
      http://fanyi.baidu.com/?aldtype=85#en/zh/Seo
      https://www.yuntask.com/
      https://www.godaddy.com/online-marketing/seo-tools
      http://www.zhantengwang.com/
      https://baike.baidu.com/item/%E6%90%9C%E7%B4%A2%E5%BC%95%E6%93%8E%E8%90%A5%E9%94%80/9387327?fromtitle=sem&fromid=2554866&fr=aladdin
      http://www.zhihu.com/question/20307058
      http://www.jiuzhilan.com/tag/sem/
      https://baike.baidu.com/item/%E6%89%AB%E6%8F%8F%E7%94%B5%E5%AD%90%E6%98%BE%E5%BE%AE%E9%95%9C/8062773?fr=aladdin
      https://baijiahao.baidu.com/s?id=1595599564161960109&wfr=spider&for=pc
      http://tieba.baidu.com/f?kw=sem&fr=ala0&tpl=5
      http://www.mysemlife.com/
      https://www.niaogebiji.com/pc/article/catlist/?type=article&catid=106
      https://www.semfenxi.com/
      http://fanyi.baidu.com/?aldtype=85#en/zh/Sem
      https://baike.baidu.com/item/PHP/9337?fr=aladdin
      https://www.php.net/
      https://www.runoob.com/php/php-tutorial.html
      http://tieba.baidu.com/f?kw=php&fr=ala0&tpl=5
      https://www.php.net/downloads.php
      https://www.php.cn/
      https://www.w3school.com.cn/php/index.asp
      https://www.baidu.com/s?tn=news&rtt=1&bsst=1&wd=php&cl=2&origin=ps
      https://www.python.org/
      https://www.runoob.com/python/python-tutorial.html
      https://baike.baidu.com/item/Python/407313?fr=aladdin
      https://www.python.org/getit/
      http://www.zhihu.com/topic/19552832
      https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000
      https://www.runoob.com/python/python-intro.html
      https://www.baidu.com/s?tn=news&rtt=1&bsst=1&wd=python&cl=2&origin=ps
      http://baijiahao.baidu.com/s?id=1648241919117346415&wfr=spider&for=pc
      http://www.java.com/
      https://baike.baidu.com/item/Java/85979?fr=aladdin
      https://www.oracle.com/technetwork/java/javase/downloads/
      http://tieba.baidu.com/f?kw=java&fr=ala0&tpl=5
      https://www.java.com/zh_CN/download
      https://www.oracle.com/technetwork/java/index.html
      https://baike.baidu.com/item/java/13130360
      https://www.baidu.com/s?tn=news&rtt=1&bsst=1&wd=java&cl=2&origin=ps
      https://www.jb51.net/list/list_207_1.htm
      https://baike.baidu.com/item/c%E8%AF%AD%E8%A8%80/105958?fromtitle=c&fromid=7252092&fr=aladdin
      https://www.runoob.com/cprogramming/c-tutorial.html
      http://tieba.baidu.com/f?kw=c&fr=ala0&tpl=5
      http://www.chem17.com/
      https://www.autohome.com.cn/3801/
      http://om.cn/
      https://baike.pcauto.com.cn/149.html
      http://c.biancheng.net/c/
      https://baike.baidu.com/item/C-BLOCK/9778228?fr=aladdin
      https://www.runoob.com/cprogramming/c-100-examples.html
      https://baike.baidu.com/item/c%E8%AF%AD%E8%A8%80/105958?fromtitle=c&fromid=7252092&fr=aladdin
      https://www.runoob.com/cprogramming/c-tutorial.html
      http://tieba.baidu.com/f?kw=c&fr=ala0&tpl=5
      http://www.chem17.com/
      https://www.autohome.com.cn/3801/
      http://om.cn/
      https://baike.pcauto.com.cn/149.html
      http://c.biancheng.net/c/
      https://baike.baidu.com/item/C-BLOCK/9778228?fr=aladdin
      https://www.runoob.com/cprogramming/c-100-examples.html
      http://tieba.baidu.com/f?kw=%B1%E0%B3%CC&fr=ala0&tpl=5
      http://ask.csdn.net/
      https://baike.baidu.com/item/%E7%BC%96%E7%A8%8B/139828?fr=aladdin
      https://www.bccn.net/
      http://www.maocode.com/
      http://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gb18030&word=%B1%E0%B3%CC&fr=ala&ala=1&alatpl=adress&pos=0&hs=2&xthttps=000000
      http://www.zhihu.com/topic/19554298
      https://baike.baidu.com/item/%E7%BC%96%E7%A8%8B%E8%AF%AD%E8%A8%80/9845131
      https://www.codemao.cn/
      https://www.iplaysoft.com/category/programming
      http://www.bdd33.com/
      http://seo.chinaz.com/www.bdd33.com/
      http://www.juhemulu.com/site.asp?SiteID=10704
      http://www.kuz8.com/v.php?id=6406763
      https://ask.seowhy.com/site/40672
      http://pr.chinaz.com/www.python66.cn
      https://ask.seowhy.com/site/30467
      https://www.cnblogs.com/caoj/p/7815750.html
      https://blog.csdn.net/qq_42127861/article/details/90749741
      https://www.cnblogs.com/xxtalhr/p/10768593.html
      
      


        采集百度PC排名的真实url代码大家复制下来测试一把,有问题反馈哦。

      文章评论

      python采集百度PC搜索结果页真实url文章写得不错,值得赞赏
      国产99视频精品免视看6