
    Classification of Python crawlers

    小妮浅浅 · 2021-09-02 11:58:28

    Crawlers are a common way of gathering data from the web, but the way they are classified is often described in vague terms. This article introduces two ways of classifying them.

    1. By purpose, crawlers can be divided into functional crawlers and incremental-data crawlers.

    2. By whether the URL and the corresponding page content change, incremental-data crawlers can be further divided into crawlers for sites where both the URL and the content change (new pages keep appearing at new URLs) and crawlers for sites where the URL stays the same but its content changes. Both examples below build on the Redis-set check sketched right after this list.
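    The shared mechanism is simple: Redis's SADD command returns 1 when a value is newly added to a set and 0 when it is already there, so a Redis set works as a persistent "have we seen this before?" check that survives between crawl runs. A minimal sketch outside of Scrapy (it assumes a local Redis server; the key name 'seen_urls' is only illustrative):

    from redis import Redis

    conn = Redis(host='127.0.0.1', port=6379)

    def is_new(value, key='seen_urls'):
        # sadd returns 1 if the value was added (never seen before), 0 if it already exists
        return conn.sadd(key, value) == 1

    for url in ['https://example.com/a', 'https://example.com/a', 'https://example.com/b']:
        if is_new(url):
            print('new, crawl it:', url)
        else:
            print('already crawled, skip:', url)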

    Examples

    - Requirement 1: crawl movie titles and descriptions from 4567tv, sending a request for a detail page only if its URL has not been seen before (new pages appear at new URLs, so deduplicate on the URL).

    # 1. spider file
     
    import scrapy
    from movieAddPro.items import MovieaddproItem
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from redis import Redis
     
    class MovieaddSpider(CrawlSpider):
        name = 'movieadd'
        # allowed_domains = ['www.xxx.com']
     
        start_urls = ['https://www.4567tv.tv/frim/index1.html']
     
        link = LinkExtractor(allow=r'/frim/index1-\d+\.html')
        rules = (
            Rule(link, callback='parse_item', follow=True),
        )
        
        # Create a Redis connection object
        conn = Redis(host='127.0.0.1', port=6379)

        # Parse each movie title and its detail-page URL
        def parse_item(self, response):
            li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
            for li in li_list:
                title = li.xpath('./div/a/@title').extract_first()
                # Get the detail-page URL
                detail_url = 'https://www.4567tv.tv' + li.xpath('./div/a/@href').extract_first()
                item = MovieaddproItem()
                item['title'] = title

                # Check whether this detail-page URL has already been requested
                ex = self.conn.sadd('movieadd_detail_urls', detail_url)
                if ex == 1:  # detail_url was not in the Redis set yet, so send the request
                    print('New data found, crawling it......')
                    yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})
                else:
                    print('No new data to crawl......')

        def parse_detail(self, response):
            item = response.meta['item']
            desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[3]/text()').extract_first()
            item['desc'] = desc

            yield item
    --------------------------------------------------------------------------------
    # 2. pipelines file

    import json

    class MovieaddproPipeline(object):

        def process_item(self, item, spider):
            dic = {
                'title': item['title'],
                'desc': item['desc']
            }
            print(dic)

            # Reuse the Redis connection created on the spider
            conn = spider.conn

            # redis-py cannot store a Python dict directly, so serialize it to JSON first
            conn.lpush('movieadd_data', json.dumps(dic))
            return item
    --------------------------------------------------------------------------------
    # 3. items file
     
    import scrapy
     
    class MovieaddproItem(scrapy.Item):
        title = scrapy.Field()
        desc = scrapy.Field()
    --------------------------------------------------------------------------------
    # 4. settings file
     
    BOT_NAME = 'movieAddPro'
     
    SPIDER_MODULES = ['movieAddPro.spiders']
    NEWSPIDER_MODULE = 'movieAddPro.spiders'
     
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
     
    ROBOTSTXT_OBEY = False
     
    LOG_LEVEL = 'ERROR'
     
    ITEM_PIPELINES = {
       'movieAddPro.pipelines.MovieaddproPipeline': 300,
    }
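    --------------------------------------------------------------------------------
    To check what the pipeline stored, the items can be read back out of Redis. A minimal sketch (not part of the original project; it assumes a local Redis server and the 'movieadd_data' list key used by the pipeline above):

    # verify.py - illustrative helper, run separately after a crawl
    import json
    from redis import Redis

    conn = Redis(host='127.0.0.1', port=6379)

    # lrange returns every entry pushed by the pipeline; decode the JSON back into a dict
    for raw in conn.lrange('movieadd_data', 0, -1):
        record = json.loads(raw)
        print(record['title'], '->', record['desc'])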
    - Requirement 2: crawl the jokes and their authors from Qiushibaike; here the same record can reappear without a distinguishing URL, so deduplicate on a SHA-256 fingerprint of the record itself.
     
    # 1. spider file
     
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from incrementByDataPro.items import IncrementbydataproItem
    from redis import Redis
    import hashlib
     
    class QiubaiSpider(CrawlSpider):
        name = 'qiubai'
        start_urls = ['https://www.qiushibaike.com/text/']
     
        rules = (
            Rule(LinkExtractor(allow=r'/text/page/\d+/'), callback='parse_item', follow=True),
            Rule(LinkExtractor(allow=r'/text/$'), callback='parse_item', follow=True),
        )
        # Create a Redis connection object
        conn = Redis(host='127.0.0.1', port=6379)

        def parse_item(self, response):
            div_list = response.xpath('//div[@id="content-left"]/div')

            for div in div_list:
                item = IncrementbydataproItem()
                item['author'] = div.xpath('./div[1]/a[2]/h2/text() | ./div[1]/span[2]/h2/text()').extract_first()
                item['content'] = div.xpath('.//div[@class="content"]/span/text()').extract_first()

                # Build a unique fingerprint of the parsed record for Redis-based deduplication;
                # guard against missing fields so the concatenation cannot fail on None
                source = (item['author'] or '') + (item['content'] or '')
                source_id = hashlib.sha256(source.encode()).hexdigest()
                # Store the fingerprint in the Redis set data_id
                ex = self.conn.sadd('data_id', source_id)

                if ex == 1:
                    print('This record has not been crawled yet, crawling it......')
                    yield item
                else:
                    print('This record has already been crawled, skipping it!!!')
    --------------------------------------------------------------------------------
    # 2. pipelines file

    import json
    from redis import Redis

    class IncrementbydataproPipeline(object):
        conn = None

        def open_spider(self, spider):
            # Open one Redis connection when the spider starts
            self.conn = Redis(host='127.0.0.1', port=6379)

        def process_item(self, item, spider):
            dic = {
                'author': item['author'],
                'content': item['content']
            }
            print(dic)
            # Serialize the dict to JSON before storing it in the Redis list
            self.conn.lpush('qiubaiData', json.dumps(dic))
            return item
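    --------------------------------------------------------------------------------
    The items file of this project is not reproduced in the original article. Judging from the import in the spider and the two fields it populates, it would look much like the one in the first example; a minimal sketch:

    # 3. items file (not shown in the original; inferred from the fields the spider uses)

    import scrapy

    class IncrementbydataproItem(scrapy.Item):
        author = scrapy.Field()
        content = scrapy.Field()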

    That covers how Python crawlers are classified; hopefully it helps. More Python learning material: python爬虫
