
Crawling and downloading ebooks from the Kan Kindle site with Python

2019-05-31 14:20:03
A Python script that downloads all the ebooks from Kan Kindle (kankindle.com). It walks the first 13 pages of the site's listing, saves each book into the ebook directory, and checks whether a book has already been downloaded before fetching it again.


#!/usr/bin/env python
# coding=utf-8
# Python 2 script: crawls kankindle.com and downloads every listed ebook
# into the ebook/ directory, skipping books that are already there.
from bs4 import BeautifulSoup
import urllib2
import re
import os


def download(url):
    # Fetch a single book page, locate the download link and save the .mobi file.
    print 'starting download %s' % url
    response = urllib2.urlopen(url, timeout=30)
    html_data = response.read()

    soup = BeautifulSoup(html_data, 'html.parser')
    print 'start to analyse---------------'

    # The download button lives in an element with class "yanshi_xiazai";
    # the book title is the page's <h1>.
    title_soup = soup.find_all(class_='yanshi_xiazai')
    name_soup = soup.find_all('h1')
    tag_a = title_soup[0].a.attrs['href']        # direct link to the .mobi file
    link_name = name_soup[0].get_text().strip()  # book title

    filename = "ebook/" + link_name + ".mobi"
    print 'filename is: %s' % filename

    print "downloading with urllib2 %s" % tag_a
    if os.path.exists(filename):
        # Already fetched on a previous run.
        print 'already downloaded, ignore'
    else:
        try:
            f = urllib2.urlopen(tag_a, timeout=60)
            data = f.read()
            with open(filename, "wb") as code:
                code.write(data)
        except Exception as e:
            print e


def get_all_link(url):
    # Collect every book-page link (URLs containing "view") on one listing page.
    print 'starting to get all the links'
    response = urllib2.urlopen(url, timeout=30)
    html_data = response.read()

    soup = BeautifulSoup(html_data, 'html.parser')
    link_soup = soup.find_all('a')

    for each_link in link_soup:
        if re.search('view', str(each_link)):
            print each_link
            print each_link.attrs['href']
            download(each_link.attrs['href'])


if __name__ == '__main__':
    # Make sure the output directory exists before the first download.
    if not os.path.exists('ebook'):
        os.makedirs('ebook')
    # Walk the first 13 listing pages.
    for page in range(1, 14):
        url = "http://kankindle.com/simple/page/" + str(page)
        print url
        get_all_link(url)
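
The listing above targets Python 2 (urllib2 and print statements). As a rough guide for Python 3 users, here is a minimal sketch of the same flow using requests and BeautifulSoup. The selectors (the yanshi_xiazai class, the h1 title, links containing "view") and the /simple/page/N listing URLs are carried over from the original script and are assumed to still match the site; requests is an extra dependency the original does not use.

# Hypothetical Python 3 port of the script above (requests + BeautifulSoup).
# The page structure it expects is taken from the original script and may
# have changed on the live site.
import os
import re

import requests
from bs4 import BeautifulSoup


def download(url):
    # Fetch a book page, find the .mobi link and the <h1> title, then save the file.
    html = requests.get(url, timeout=30).text
    soup = BeautifulSoup(html, 'html.parser')

    book_link = soup.find(class_='yanshi_xiazai').a['href']
    title = soup.find('h1').get_text().strip()
    filename = os.path.join('ebook', title + '.mobi')

    if os.path.exists(filename):
        print('already downloaded, skipping:', filename)
        return
    print('downloading', book_link, '->', filename)
    data = requests.get(book_link, timeout=60).content
    with open(filename, 'wb') as f:
        f.write(data)


def get_all_links(url):
    # Walk one listing page and hand every "view" link to download().
    html = requests.get(url, timeout=30).text
    soup = BeautifulSoup(html, 'html.parser')
    for a in soup.find_all('a', href=True):
        if re.search('view', a['href']):
            try:
                download(a['href'])
            except Exception as e:
                print('failed:', e)


if __name__ == '__main__':
    os.makedirs('ebook', exist_ok=True)
    for page in range(1, 14):  # first 13 listing pages
        get_all_links('http://kankindle.com/simple/page/' + str(page))

Install the two dependencies with pip install requests beautifulsoup4 before running it; os.makedirs(..., exist_ok=True) replaces the manual directory check used in the Python 2 version.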
