SoFunction
Updated on 2024-10-28

Python implementation of the Sina blog backup method

This article presents an example of how to back up a Sina blog with Python. It is shared here for your reference:

The script was implemented with Python 2.7.2; running it from an IDE is recommended.

# -*- coding:UTF-8 -*- #
'''
Created on 2011-12-18
@author: Ahan
'''
import re
import sys
import os
import time
import socket
import locale
import datetime
import codecs
from urllib import urlopen
# Regular expression definitions
# Each patternN is the pattern source and progN is its compiled form; the
# rest of the script only uses the compiled progN objects.
# NOTE(review): the literal anchor text inside these patterns (link captions
# and HTML comment markers) has to match the blog pages exactly -- confirm
# against the live markup.
#Match blog post directory links (group 1: directory URL)
pattern1=u"""<a href="(http:.*?)">blog post catalog</a>"""
prog1 = re.compile(pattern1)
#Matching blog post title links (group 1: title, group 2: post URL)
pattern2=u"""<a title="(.*?)" target="_blank" href="(.*?)">.*?</a>"""
prog2 = re.compile(pattern2)
#Match the "next page" link (group 1: URL of the next page)
pattern3=u"""<a href="([^"]+)" title="[^"]+"> next page"""
prog3 = re.compile(pattern3)
#Match body parts
# [\s\S] (written with doubled backslashes because this is not a raw string)
# matches any character including newlines, so the body may span lines.
pattern4=u"""<! --Blog post body begin -->[\\s\\S]*? <! -- end of main text -->"""
prog4 = re.compile(pattern4)
#Match body image links
# group 1: the whole src/real_src attribute pair, group 2: the real_src
# attribute with its leading space, group 3: the real image URL
pattern5=u"""(src="[^"]+"( real_src ="([^"]+)"))"""
prog5 = re.compile(pattern5)
def read_date_from_url(url):
  """Returns all data read from the url in Unicode form.

  The response is read in 1024-byte chunks and decoded once at the end.
  Returns None, after printing the error, if opening, reading or decoding
  fails.  The connection is always closed when it was opened.
  """
  # Bound before the try so the finally clause can test it even when
  # urlopen itself raises.
  request = None
  try:
    chunks = []
    request = urlopen(url)
    while True:
      s = request.read(1024)
      if not s:
        break
      chunks.append(s)
    # NOTE(review): assumes the pages are UTF-8 encoded -- confirm. Decoding
    # explicitly avoids depending on the interpreter's default codec.
    return unicode("".join(chunks), 'utf-8')
  except Exception:
    print('Error while reading data')
    print("Unexpected error: %s %s" % (sys.exc_info()[0], sys.exc_info()[1]))
    return None
  finally:
    if request:
      request.close()
def save_to_file(url,filename,blog_address):
  """Download one blog post and store it, with its images, under blog_address.

  url is the address of the blog post, filename is the name of the file to
  be saved (the suffix '.html' is appended), blog_address is the folder the
  post is written into.  Images found in the post body are saved into a
  sub-folder named after the post, and the body is rewritten to reference
  the local copies.  A failed image download is reported and skipped.

  Raises the exception from _filter when the post body cannot be extracted.
  """
  # Create folder if it doesn't exist
  if not os.path.exists(blog_address):
    os.makedirs(blog_address)
  # Remove illegal characters from filenames
  filename=ReplaceBadCharOfFileName(filename)
  # Avoid overwriting an existing post: try name(0), name(1), ... Each
  # candidate is built from the base name so suffixes do not pile up.
  base_name=filename
  file_no=0
  while os.path.exists(blog_address+'/'+filename+'.html'):
    filename=base_name+'('+str(file_no)+')'
    file_no+=1
  text = read_date_from_url(url)
  text=_filter(text)
  #Save images locally
  result=prog5.findall(text)
  folder=blog_address+'/'+filename
  i=1
  for pic in result:
    pic_name='image'+str(i)+'.gif'
    if not os.path.exists(folder):
      os.makedirs(folder)
    try:
      # pic[2] is the real image URL captured by the image pattern
      url_file = urlopen(pic[2])
      try:
        with open(folder+'/'+pic_name,'wb') as pic_file:
          while True:
            s = url_file.read(1024)
            if not s:
              break
            pic_file.write(s)
      finally:
        url_file.close()
    except Exception:
      print('Oh, there was a problem saving the image, skip this one...')
      print("Unexpected error: %s %s" % (sys.exc_info()[0], sys.exc_info()[1]))
    else:
      print('Saving the image was successful...')
      #Replace the image address in the body of the text
      # Only the first occurrence is replaced so identical attribute pairs
      # later in the body are handled by their own loop iteration.
      text=text.replace(pic[0],unicode("src=\"" + filename + "/" + pic_name + "\"" + pic[1]),1)
      i=i+1
  # The body is Unicode text, so it is encoded explicitly while writing.
  blog_file = codecs.open(blog_address+'/'+filename+'.html','wb','utf-8')
  try:
    blog_file.write(text)
  finally:
    blog_file.close()
# Extract the body part of the text
def _filter(t):
  """Extracts the body portion of a text, returning a string in Unicode form.

  t is the full page text (or None when the download failed).  The first
  match of the body pattern is wrapped in a minimal HTML skeleton.

  Raises Exception when t is None or contains no body section.
  """
  result = None
  if t is not None:
    result = prog4.search(t)
  if result is None:
    raise Exception('Oh, there was an error extracting the body ......')
  return u'<html><head></head><body>' + unicode(result.group()) + u'</body></html>'
# Remove illegal characters from file names
def ReplaceBadCharOfFileName(filename):
  """Return filename with characters that are unsafe in file names removed.

  The HTML entity '&nbsp;' is dropped first as a whole, then each of the
  characters \\ / : * ? < > | & ; is removed wherever it occurs.
  """
  # Remove the entity before the single characters; otherwise stripping
  # '&' and ';' would leave a stray 'nbsp' behind.
  filename = filename.replace("&nbsp;", "")
  for bad_char in ("\\", "/", ":", "*", "?", "<", ">", "|", "&", ";"):
    filename = filename.replace(bad_char, "")
  return filename
#main function
def _read_int(prompt, retry_prompt, minimum):
  """Ask on the console until an integer of at least minimum is entered.

  prompt is shown for the first attempt and retry_prompt after every
  rejected answer (non-numeric input or a value below minimum).
  """
  answer = raw_input(prompt)
  while True:
    try:
      value = int(answer)
    except ValueError:
      value = None
    if value is not None and value >= minimum:
      return value
    answer = raw_input(retry_prompt)


if __name__ == '__main__':
  # Preparation phase
  blog_no=1# number of the blog post currently being looked at
  page=0# page number
  saved=0# of articles successfully saved
  timeout = 60*5# Timeout set to 5 minutes
  # Set the timeout for the entire socket layer here, so every later
  # network call uses it without having to set it again.
  socket.setdefaulttimeout(timeout)
  blog_address=raw_input("Please enter your blog address (just enter the last part, for example if your blog address is /jiangafu, just enter jiangafu):")
  blog_address=blog_address.replace('\r','')
  # begin: first post to save (1-based); end: last post to save, 0 = no limit
  begin=_read_int('From the first article:','Please enter a number greater than 0:',1)
  end=_read_int('To the end of the first post (enter 0 at the end):','Please enter a number greater than or equal to 0:',0)
  if end==0:
    print('Your blog address is:/'+blog_address+', save the first '+str(begin)+'Episode to the last blog post')
  else:
    print('Your blog address is:/'+blog_address+', save the first '+str(begin)+' to p. '\
       +str(end)+'A blog post')
  starttime = datetime.datetime.now()
  # NOTE(review): the host is assumed to be Sina's blog domain -- confirm.
  text=read_date_from_url('http://blog.sina.com.cn/'+blog_address)
  time.sleep(0.5)
  # Extract the url of the "blog post directory".
  result = None
  if text is not None:
    result = prog1.search(text)
  if result is not None:
    print('Blog post catalog address: %s' % result.group(1))
    text=read_date_from_url(result.group(1))
    time.sleep(0.4)
  else:
    print('Failed to extract blog post directory address')
    # Terminate the program
    sys.exit(1)
  #Find all blog posts on each page, analyze, extract, save
  finished = False
  while not finished:
    if text is None:
      # The page could not be downloaded, so there is nothing to parse.
      break
    page+=1
    print('Starting Backup No. %s Page' % page)
    #Match all blog post addresses on this page
    result=prog2.findall(text)
    #Recurring downloads for each blog post on this page
    for blog in result:
      if blog_no < begin:
        # Not yet at the first requested post: just count it.
        blog_no += 1
      elif end != 0 and blog_no > end:
        # Past the last requested post: stop paging altogether.
        finished = True
        break
      else:
        try:
          save_to_file(blog[1],unicode(blog[0]),blog_address)
        except Exception:
          print("Oh, save No. %s A blog post %s There's a problem with skipping..." % (blog_no, blog[0]))
          blog_no += 1
          print("Unexpected error: %s %s" % (sys.exc_info()[0], sys.exc_info()[1]))
        else:
          print('Successfully saved the first  %s A blog post: %s' % (blog_no, blog[0]))
          blog_no += 1
          saved += 1
          # Short pause between posts
          time.sleep(0.4)
    if finished:
      break
    #Determine if there is a next page
    result = prog3.search(text)
    if result is not None:
      text = read_date_from_url(result.group(1))
    else:
      print('This is the last page')
      break
  print('Blog backup complete total backup %s A blog post' % saved)
  print('Shared time: %s' % (datetime.datetime.now() - starttime))
  raw_input('Press enter to exit...')

Readers interested in more Python-related content can check out this site's topics: "Python Data Structures and Algorithms Tutorial", "Python Socket Programming Tips Summary", "Summary of Python Function Usage Tips", "Summary of Python String Manipulation Techniques", "Python Introductory and Advanced Classic Tutorials", and "Summary of Python File and Directory Manipulation Techniques".

I hope that what I have said in this article will help you in Python programming.