This article shares a Python book-information crawler example for your reference. The details are as follows.
Background description
We need to collect some book information, using Douban book entries as the data source, extract the valid book fields, and save them to a local database.
Get book classification tags
For details, please refer to this link:
/tag/?view=type
Then save these classification tag links to a local file (book_tags.txt, which the crawler below reads); the stored content looks like this:
/tag/Novel /tag/Foreign Literature /tag/Literature /tag/Essay /tag/Chinese Literature /tag/Classic /tag/Japanese Literature /tag/Prose /tag/Haruki Murakami /tag/Poetry /tag/Fairy Tale ......
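The listing later in this article does not include the tag-collection step itself, so here is a minimal sketch of it. The site base URL, the "/tag/" href filter, and the one-URL-per-line output format are assumptions; the per-line format is what the crawler's main() below expects.

#!/usr/bin/python
# coding: utf-8
# Minimal sketch: collect the book classification tag links and save them to book_tags.txt.
# BASE_URL and the "/tag/" href filter are assumptions about the tag index page layout.
import requests
from bs4 import BeautifulSoup

BASE_URL = "https://book.douban.com"          # assumed site root
TAG_INDEX_URL = BASE_URL + "/tag/?view=type"  # the tag index page referenced above

def collect_tags():
    page = requests.get(TAG_INDEX_URL, timeout=10)
    soup = BeautifulSoup(page.content.decode("utf-8"), "lxml")
    # Every classification tag on the index page links to a path like /tag/xxx
    hrefs = [a.get("href") for a in soup.select("a")
             if a.get("href", "").startswith("/tag/")]
    # Drop the index link itself, query links, and duplicates, keeping order
    seen, tags = set(), []
    for href in hrefs:
        if href == "/tag/" or "?" in href or href in seen:
            continue
        seen.add(href)
        tags.append(BASE_URL + href)
    # One full tag URL per line, which is the format main() in the crawler expects
    with open("book_tags.txt", "w") as fd:
        fd.write("\n".join(tags))

if __name__ == "__main__":
    collect_tags()

Each line in book_tags.txt then serves as the base URL of one tag's book list, which the crawler pages through with the start parameter.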
Get book information and save it to the local database
Assume the MySQL table has already been created, as follows:
CREATE TABLE `book_info` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `bookid` varchar(64) NOT NULL COMMENT 'Book ID',
  `tag` varchar(32) DEFAULT '' COMMENT 'Category tag',
  `bookname` varchar(256) NOT NULL COMMENT 'Book title',
  `subname` varchar(256) NOT NULL COMMENT 'Subtitle',
  `author` varchar(256) DEFAULT '' COMMENT 'Author',
  `translator` varchar(256) DEFAULT '' COMMENT 'Translator',
  `press` varchar(128) DEFAULT '' COMMENT 'Publishing house',
  `publishAt` date DEFAULT '0000-00-00' COMMENT 'Publication date',
  `stars` float DEFAULT '0' COMMENT 'Score',
  `price_str` varchar(32) DEFAULT '' COMMENT 'Price string',
  `hotcnt` int(11) DEFAULT '0' COMMENT 'Number of comments',
  `bookdesc` varchar(8192) DEFAULT NULL COMMENT 'Introduction',
  `updateAt` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'Modification date',
  PRIMARY KEY (`id`),
  UNIQUE KEY `idx_bookid` (`bookid`),
  KEY `idx_bookname` (`bookname`),
  KEY `hotcnt` (`hotcnt`),
  KEY `stars` (`stars`),
  KEY `idx_tag` (`tag`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='Book information';
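Note that bookid carries a unique key (idx_bookid) and the crawler writes with insert ignore, so a book that is crawled again is deduplicated automatically. Below is a minimal single-row sketch of how a parsed record maps onto this table; the connection parameters simply mirror the DestDB settings further down, and the sample title and author are made up.

# Sketch: map one parsed book onto book_info; dedup relies on UNIQUE KEY idx_bookid.
# Connection parameters mirror the DestDB settings below; the sample book is made up.
import pymysql
from hashlib import md5

def insert_one(conn, tag, bookname, author):
    # Same bookid scheme as the crawler below: md5 of "author_bookname"
    bookid = md5(("%s_%s" % (author, bookname)).encode("utf-8")).hexdigest()
    sql = ("insert ignore into book_info (`bookid`,`tag`,`bookname`,`subname`,`author`) "
           "values (%s,%s,%s,%s,%s)")
    with conn.cursor() as cur:
        # Running this twice inserts only one row: the second insert is ignored
        cur.execute(sql, (bookid, tag, bookname, "", author))
    conn.commit()

conn = pymysql.connect(host="192.168.1.10", user="test", passwd="123456",
                       db="spider", charset="utf8")
insert_one(conn, "novel", "Norwegian Wood", "Haruki Murakami")
conn.close()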
The crawler logic is implemented as follows, mainly based on the BeautifulSoup package:
#!/usr/bin/python
# coding: utf-8
import re
import logging
import requests
import pymysql
import random
import time
import datetime
from hashlib import md5
from bs4 import BeautifulSoup

logging.basicConfig(
    level=logging.INFO,
    format='[%(levelname)s][%(name)s][%(asctime)s]%(message)s',
    datefmt='%Y-%m-%d %H:%M:%S')


class DestDB:
    Host = "192.168.1.10"
    DB = "spider"
    Table = "book_info"
    User = "test"
    Pwd = "123456"


def connect_db(host, db, user, pwd):
    conn = pymysql.connect(
        host=host,
        user=user,
        passwd=pwd,
        db=db,
        charset='utf8',
        connect_timeout=3600)  # ,
        # cursorclass=pymysql.cursors.DictCursor)
    conn.autocommit(True)
    return conn


def disconnect_db(conn, cursor):
    cursor.close()
    conn.close()


# Extract the number of reviewers; if it cannot be parsed
# (e.g. "fewer than 10 people"), fall back to 10.
def hotratings(person):
    try:
        ptext = person.get_text().split()[0]
        pc = int(ptext[1:len(ptext)-4])
    except ValueError:
        pc = int(10)
    return pc


# Persist to the database
def save_to_db(tag, book_reslist):
    dest_conn = connect_db(DestDB.Host, DestDB.DB, DestDB.User, DestDB.Pwd)
    dest_cursor = dest_conn.cursor()

    isql = "insert ignore into book_info "
    isql += "(`bookid`,`tag`,`author`,`translator`,`bookname`,`subname`,`press`,"
    isql += "`publishAt`,`price_str`,`stars`,`hotcnt`,`bookdesc`) values "
    isql += ",".join(["(%s)" % ",".join(['%s']*12)]*len(book_reslist))

    values = []
    for row in book_reslist:
        # Temporarily use md5(bookname+author) as the unique bookid
        bookid = md5(("%s_%s" % (row[0], row[2])).encode('utf-8')).hexdigest()
        values.extend([bookid, tag] + row[:10])

    dest_cursor.execute(isql, tuple(values))
    disconnect_db(dest_conn, dest_cursor)


# Process each visited page
def do_parse(tag, url):
    page_data = requests.get(url)
    soup = BeautifulSoup(page_data.content.decode("utf-8"), "lxml")
    # Extract tag information
    tag = url.split("?")[0].split("/")[-1]
    # Grab author and publisher information
    details = soup.select("#subject_list > ul > li > div.info > div.pub")
    # Grab rating scores
    scores = soup.select("#subject_list > ul > li > div.info > div.star.clearfix > span.rating_nums")
    # Grab the number of reviewers
    persons = soup.select("#subject_list > ul > li > div.info > div.star.clearfix > span.pl")
    # Grab book titles
    booknames = soup.select("#subject_list > ul > li > div.info > h2 > a")
    # Grab introductions
    descs = soup.select("#subject_list > ul > li > div.info > p")

    # Separate the contents and combine them with the tag information
    book_reslist = []
    for detail, score, personCnt, bookname, desc in zip(details, scores, persons, booknames, descs):
        try:
            subtitle = ""
            title_strs = [s.replace('\n', '').strip() for s in bookname.strings]
            title_strs = [s for s in title_strs if s]
            # Some books have a secondary title
            if not title_strs:
                continue
            elif len(title_strs) >= 2:
                bookname, subtitle = title_strs[:2]
            else:
                bookname = title_strs[0]
            # Number of reviewers
            hotcnt = hotratings(personCnt)
            desc = desc.get_text()
            stars = float('%.1f' % float(score.get_text() if score.get_text() else "-1"))

            author, translator, press, publishAt, price = [""]*5
            detail_texts = detail.get_text().replace('\n', '').split("/")
            detail_texts = [s.strip() for s in detail_texts]
            # Some books have no translator information
            if len(detail_texts) == 4:
                author, press, publishAt, price = detail_texts[:4]
            elif len(detail_texts) >= 5:
                author, translator, press, publishAt, price = detail_texts[:5]
            else:
                continue

            # Convert the publication date to a date type
            if re.match(r'^[\d]{4}-[\d]{1,2}', publishAt):
                dts = publishAt.split('-')
                publishAt = datetime.date(int(dts[0]), int(dts[1]), 1)
            else:
                publishAt = datetime.date(1000, 1, 1)

            book_reslist.append([author, translator, bookname, subtitle, press,
                                 publishAt, price, stars, hotcnt, desc])
        except Exception as e:
            logging.error(e)

    logging.info("insert count: %d" % len(book_reslist))
    if len(book_reslist) > 0:
        save_to_db(tag, book_reslist)
        book_reslist = []
    return len(details)


def main():
    with open("book_tags.txt") as fd:
        tags = fd.readlines()

    for tag in tags:
        tag = tag.strip()
        logging.info("current tag url: %s" % tag)
        for idx in range(0, 1000000, 20):
            try:
                url = "%s?start=%d&type=T" % (tag, idx)
                cnt = do_parse(tag.split('/')[-1], url)
                if cnt < 10:
                    break
                # Sleep for several seconds to reduce the access frequency
                time.sleep(random.randint(10, 15))
            except Exception as e:
                logging.error("outer_err: %s" % e)
                time.sleep(300)


if __name__ == "__main__":
    main()
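After a run, the result can be spot-checked with a simple query against the table. The sketch below mirrors the DestDB connection settings, and the tag value "novel" is only an example.

# Sketch: spot-check crawl results for one tag.
# Connection settings mirror DestDB above; the tag value is just an example.
import pymysql

conn = pymysql.connect(host="192.168.1.10", user="test", passwd="123456",
                       db="spider", charset="utf8")
with conn.cursor() as cursor:
    cursor.execute(
        "select bookname, author, stars, hotcnt from book_info "
        "where tag = %s order by hotcnt desc limit 10",
        ("novel",))
    for bookname, author, stars, hotcnt in cursor.fetchall():
        print("%s / %s  stars=%.1f  comments=%d" % (bookname, author, stars, hotcnt))
conn.close()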
Summary
The above code runs in a Python 3 environment;
BeautifulSoup needs to be installed first: pip install beautifulsoup4 (the code also depends on requests, pymysql, and lxml);
During the crawling process, you need to control the access frequency;
Some fields require exception handling, such as missing translator information, the number of reviewers, and so on.
That is all the content of this article. I hope it is helpful to your study, and thank you for your support.