
    Scraping and Downloading Ebooks from Kankindle with Python

    Published 2019-05-31
    A Python script that downloads every ebook listed on Kankindle (kankindle.com). It automatically walks the first 13 pages of the listing, saves each book into the ebook directory, and checks whether a file has already been downloaded before fetching it again.

    #!/usr/bin/env python
    # coding=utf-8
    from bs4 import BeautifulSoup
    import urllib2
    import re
    import os
    def download(url):
        # fetch one book page, locate the .mobi link and save the file
        print 'starting download %s' % url
        response = urllib2.urlopen(url, timeout=30)
        html_data = response.read()

        soup = BeautifulSoup(html_data, 'html.parser')
        print 'start to analyse---------------'
        
        
        # the download link and the book title live in separate elements
        title_soup = soup.find_all(class_='yanshi_xiazai')
        name_soup = soup.find_all('h1')
        tag_a = title_soup[0].a.attrs['href']
        link_name = name_soup[0].get_text().strip()
        
        
        # save as ebook/<title>.mobi, creating the directory on first use
        if not os.path.isdir("ebook"):
            os.makedirs("ebook")
        filename = os.path.join("ebook", link_name + ".mobi")
        print 'filename is: %s' % filename

        print "downloading with urllib2 %s" % tag_a
        if os.path.exists(filename):
            print 'already downloaded, skipping'
        else:
            try:
                f = urllib2.urlopen(tag_a, timeout=60)
                data = f.read()
                with open(filename, "wb") as code:
                    code.write(data)
            except Exception, e:
                print e
    def get_all_link(url):
        # fetch one listing page and download every link whose URL contains 'view'
        print 'starting to get all the links'
        response = urllib2.urlopen(url, timeout=30)
        html_data = response.read()

        soup = BeautifulSoup(html_data, 'html.parser')
        link_soup = soup.find_all('a')

        for each_link in link_soup:
            if re.search('view', str(each_link)):
                print each_link.attrs['href']
                download(each_link.attrs['href'])
    if __name__ == '__main__':
        # walk the first 13 listing pages of the site
        for page in range(1, 14):
            url = "http://kankindle.com/simple/page/" + str(page)
            print url
            get_all_link(url)
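
    The script above is written for Python 2 (urllib2 and print statements), which is no longer maintained. Below is a minimal sketch of the same logic under Python 3, where urllib2 was folded into urllib.request and print became a function; the yanshi_xiazai class, the h1 title element, and the 'view' detail-link pattern are taken from the original script and assumed to still match the site's markup.

    #!/usr/bin/env python3
    # coding=utf-8
    # Minimal Python 3 sketch of the same scraper; assumes the site layout
    # (yanshi_xiazai class, h1 title, 'view' detail links) is unchanged.
    import os
    import re
    import urllib.request
    from bs4 import BeautifulSoup

    def download(url):
        # fetch one book page, extract the .mobi link and the title
        html_data = urllib.request.urlopen(url, timeout=30).read()
        soup = BeautifulSoup(html_data, 'html.parser')
        tag_a = soup.find_all(class_='yanshi_xiazai')[0].a.attrs['href']
        link_name = soup.find_all('h1')[0].get_text().strip()
        os.makedirs('ebook', exist_ok=True)  # create target dir on first run
        filename = os.path.join('ebook', link_name + '.mobi')
        if os.path.exists(filename):
            print('already downloaded, skipping %s' % filename)
            return
        with open(filename, 'wb') as out:
            out.write(urllib.request.urlopen(tag_a, timeout=60).read())

    def get_all_link(url):
        # fetch one listing page and follow every 'view' detail link
        html_data = urllib.request.urlopen(url, timeout=30).read()
        soup = BeautifulSoup(html_data, 'html.parser')
        for each_link in soup.find_all('a'):
            if re.search('view', str(each_link)):
                download(each_link.attrs['href'])

    if __name__ == '__main__':
        for page in range(1, 14):
            get_all_link("http://kankindle.com/simple/page/" + str(page))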