This article gives an example of how to back up a Sina blog with Python. It is shared for your reference, as follows:
The script was written for Python 2.7.2; running it from an IDE is recommended.
# -*- coding:UTF-8 -*- #
'''
Created on 2011-12-18

@author: Ahan

Back up the posts of a Sina blog as local HTML files.

The script asks for a blog name and a range of post numbers, walks the
blog's post-catalog pages, and saves each post body (plus the images it
references) under a folder named after the blog.

Written for Python 2.7; the small compatibility shims below also let the
module be imported and run on Python 3.
'''
import re
import sys
import os
import time
import socket
import locale
import datetime
import codecs

try:
    from urllib import urlopen  # Python 2
except ImportError:
    from urllib.request import urlopen  # Python 3

try:
    text_type = unicode
    read_input = raw_input
except NameError:  # Python 3: str is already Unicode, input() reads a line
    text_type = str
    read_input = input

# NOTE(review): the host below is assumed from the article's subject (Sina
# blog); the scraped source only kept '/' + blog name. Confirm before use.
BLOG_BASE_URL = 'http://blog.sina.com.cn/'

# Characters removed from post titles before they are used as file names.
_BAD_FILENAME_CHARS = ' \\/:*?<>|&;'

# Regular expression definitions.
# NOTE(review): the literal marker text inside pattern1, pattern3 and
# pattern4 must match the blog's real HTML. It looks machine-translated
# ("<! --" is not a valid HTML comment opener) -- confirm against a live
# page before relying on these patterns.

# Match the link to the blog post catalog
pattern1 = u"""<a href="(http:.*?)">blog post catalog</a>"""
prog1 = re.compile(pattern1)
# Match blog post title links: group 1 is the title, group 2 the post URL
pattern2 = u"""<a title="(.*?)" target="_blank" href="(.*?)">.*?</a>"""
prog2 = re.compile(pattern2)
# Match the "next page" link
pattern3 = u"""<a href="([^"]+)" title="[^"]+"> next page"""
prog3 = re.compile(pattern3)
# Match the post body. [\s\S] means "any character including newlines"; the
# backslashes are doubled because this is not a raw string.
pattern4 = u"""<! --Blog post body begin -->[\\s\\S]*? <! -- end of main text -->"""
prog4 = re.compile(pattern4)
# Match body image links: group 1 is the whole src/real_src attribute pair,
# group 2 the real_src attribute, group 3 the real image URL
pattern5 = u"""(src="[^"]+"( real_src ="([^"]+)"))"""
prog5 = re.compile(pattern5)


def read_date_from_url(url):
    """Return all data read from ``url`` as Unicode text.

    The response is read in 1 KiB chunks and decoded as UTF-8
    (assumes the pages are UTF-8 encoded -- TODO confirm).

    Returns None, after printing the error, if anything goes wrong.
    """
    request = None  # bound up front so ``finally`` is safe if urlopen fails
    try:
        request = urlopen(url)
        chunks = []
        while True:
            s = request.read(1024)
            if not s:
                break
            chunks.append(s)
        return b''.join(chunks).decode('utf-8')
    except Exception:
        print('Error while reading data')
        print('Unexpected error: %s %s' % (sys.exc_info()[0], sys.exc_info()[1]))
        return None
    finally:
        if request is not None:
            request.close()


def _download_file(url, path):
    """Copy the resource at ``url`` into the local file ``path``."""
    remote = urlopen(url)
    try:
        with open(path, 'wb') as local:
            while True:
                s = remote.read(1024)
                if not s:
                    break
                local.write(s)
    finally:
        remote.close()


def save_to_file(url, filename, blog_address):
    """Download one blog post and save it as ``<blog_address>/<filename>.html``.

    url          -- address of the blog post
    filename     -- name of the file to save, without the .html suffix
    blog_address -- folder the post is saved into (created if missing)

    Images referenced by the post are stored in
    ``<blog_address>/<filename>/`` and the post's ``src`` attributes are
    rewritten to point at the local copies. A failed image download is
    reported and skipped.

    Raises IOError if the post cannot be downloaded and Exception if its
    body cannot be extracted.
    """
    # Create the folder if it doesn't exist
    if not os.path.exists(blog_address):
        os.makedirs(blog_address)
    # Remove illegal characters from the file name
    base_name = ReplaceBadCharOfFileName(filename)
    filename = base_name
    file_no = 0
    # Always suffix the clean base name so suffixes don't pile up
    while os.path.exists(blog_address + '/' + filename + '.html'):
        filename = base_name + '(' + str(file_no) + ')'
        file_no += 1

    text = read_date_from_url(url)
    if text is None:
        raise IOError('Could not download ' + url)
    text = _filter(text)

    # Save images locally
    folder = blog_address + '/' + filename + '/'
    i = 1
    for pic in prog5.findall(text):
        pic_name = 'image' + str(i) + '.gif'
        if not os.path.exists(folder):
            os.makedirs(folder)
        try:
            _download_file(pic[2], folder + pic_name)
        except Exception:
            print('Oh, there was a problem saving the image, skip this one...')
            print('Unexpected error: %s %s' % (sys.exc_info()[0], sys.exc_info()[1]))
        else:
            print('Saving the image was successful...')
            # Replace the image address in the body of the text
            local_src = text_type('src="' + filename + '/' + pic_name + '"' + pic[1])
            text = text.replace(pic[0], local_src, 1)
            i += 1

    # Write as UTF-8 so non-ASCII post text can be saved
    with codecs.open(blog_address + '/' + filename + '.html', 'w', 'utf-8') as blog_file:
        blog_file.write(text)


# Extract the body part of the text
def _filter(t):
    """Return the post body found in ``t`` wrapped in a minimal HTML page.

    Raises Exception when the body markers of pattern4 are not found.
    """
    result = prog4.search(t)
    if result is None:
        raise Exception('Oh, there was an error extracting the body ......')
    return u'<html><head></head><body>' + text_type(result.group()) + u'</body></html>'


# Remove illegal characters from file names
def ReplaceBadCharOfFileName(filename):
    """Return ``filename`` with spaces and the characters \\ / : * ? < > | & ; removed."""
    for bad_char in _BAD_FILENAME_CHARS:
        filename = filename.replace(bad_char, '')
    return filename


def _ask_int(prompt, retry_prompt, minimum):
    """Prompt until the user enters an integer that is at least ``minimum``."""
    answer = read_input(prompt)
    while True:
        try:
            value = int(answer)
        except ValueError:
            value = None  # non-numeric input is treated like an out-of-range one
        if value is not None and value >= minimum:
            return value
        answer = read_input(retry_prompt)


def main():
    """Interactively back up a range of posts from one blog."""
    # Preparation phase
    blog_no = 1  # number of the blog post currently being looked at
    page = 0  # catalog page number
    saved = 0  # number of posts successfully saved
    timeout = 60 * 5  # timeout set to 5 minutes
    # Set the timeout for the entire socket layer here, so sockets opened
    # later don't need to set it again.
    socket.setdefaulttimeout(timeout)

    blog_address = read_input("Please enter your blog address (just enter the last part, for example if your blog address is /jiangafu, just enter jiangafu):")
    blog_address = blog_address.replace('\r', '')
    begin = _ask_int('From the first article:',
                     'Please enter a number greater than 0:', 1)
    # 0 means "up to the last post"
    end = _ask_int('To the end of the first post (enter 0 at the end):',
                   'Please enter a number greater than or equal to 0:', 0)

    blog_url = BLOG_BASE_URL + blog_address
    if end == 0:
        print('Your blog address is:' + blog_url + ', save the first '
              + str(begin) + 'Episode to the last blog post')
    else:
        print('Your blog address is:' + blog_url + ', save the first '
              + str(begin) + ' to p. ' + str(end) + 'A blog post')

    starttime = datetime.datetime.now()
    text = read_date_from_url(blog_url)
    time.sleep(0.5)

    # Extract the url of the "blog post directory"
    result = prog1.search(text) if text is not None else None
    if result is None:
        print('Failed to extract blog post directory address')
        # Terminate the program
        sys.exit()
    print('Blog post catalog address: %s' % result.group(1))
    text = read_date_from_url(result.group(1))
    time.sleep(0.4)

    # Find all blog posts on each page, analyze, extract, save
    reached_end = False
    while text is not None:
        page += 1
        print('Starting Backup No. %s Page' % page)
        # Loop over every blog post address matched on this page
        for blog in prog2.findall(text):
            if blog_no < begin:
                blog_no += 1
                continue
            if end != 0 and blog_no > end:
                # Past the requested range: stop paging altogether
                reached_end = True
                break
            try:
                save_to_file(blog[1], text_type(blog[0]), blog_address)
            except Exception:
                print("Oh, save No. %s A blog post %s There's a problem with skipping..."
                      % (blog_no, blog[0]))
                print('Unexpected error: %s %s' % (sys.exc_info()[0], sys.exc_info()[1]))
            else:
                print('Successfully saved the first  %s A blog post: %s'
                      % (blog_no, blog[0]))
                saved += 1
            blog_no += 1
            time.sleep(0.4)
        if reached_end:
            break
        # Determine if there is a next page
        result = prog3.search(text)
        if result is None:
            print('This is the last page')
            break
        text = read_date_from_url(result.group(1))

    print('Blog backup complete total backup %s A blog post' % saved)
    print('Shared time: %s' % (datetime.datetime.now() - starttime))
    read_input('Press enter to exit...')


# main function
if __name__ == '__main__':
    main()
Readers interested in more Python-related content can check out this site's topics: 《Python Data Structures and Algorithms Tutorial》, 《Python Socket Programming Tips Summary》, 《Summary of Python Function Usage Tips》, 《Summary of Python String Manipulation Techniques》, 《Python Introductory and Advanced Classic Tutorials》, and 《Summary of Python File and Directory Manipulation Techniques》.
I hope that what I have said in this article will help you in Python programming.