1. Use xpath to remove unnecessary tags and tags with no content
import re

from lxml import etree
from loguru import logger

def xpath_clean(self, text: str, xpath_dict: dict) -> str:
    '''
    xpath removes unnecessary elements
    :param text: html_content
    :param xpath_dict: xpath expressions for the targets to remove
    :return: html_content as a string
    '''
    remove_by_xpath = xpath_dict if xpath_dict else dict()

    # Tags that should be removed in almost every case, barring extreme situations
    remove_by_xpath.update({
        '_remove_2': '//iframe',
        '_remove_4': '//button',
        '_remove_5': '//form',
        '_remove_6': '//input',
        '_remove_7': '//select',
        '_remove_8': '//option',
        '_remove_9': '//textarea',
        '_remove_10': '//figure',
        '_remove_11': '//figcaption',
        '_remove_12': '//frame',
        '_remove_13': '//video',
        '_remove_14': '//script',
        '_remove_15': '//style'
    })

    parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
    selector = etree.HTML(text, parser=parser)

    # Regular deletion: remove every element matched by the xpath expressions
    for xpath in remove_by_xpath.values():
        for bad in selector.xpath(xpath):
            bad_string = etree.tostring(bad, encoding='utf-8', pretty_print=True).decode()
            logger.info(f"clean article content : {bad_string}")
            bad.getparent().remove(bad)

    skip_tip = "name()='img' or name()='tr' or " \
               "name()='th' or name()='tbody' or " \
               "name()='thead' or name()='table'"
    # Walk every tag outside the whitelist; delete it if it contains
    # neither visible text nor a whitelisted descendant
    for p in selector.xpath(f"//*[not({skip_tip})]"):
        # Skip logic
        if p.xpath(f".//*[{skip_tip}]") or \
                bool(re.sub(r'\s', '', p.xpath('string(.)'))):
            continue
        bad_p = etree.tostring(p, encoding='utf-8', pretty_print=True).decode()
        logger.info(f"clean p tag : {bad_p}")
        p.getparent().remove(p)

    return etree.tostring(selector, encoding='utf-8', pretty_print=True).decode()
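To sanity-check this step in isolation, the following minimal sketch applies the same parser settings and removal loop to a made-up snippet (the sample HTML is an example, not from a real page):

from lxml import etree

# Minimal sketch: same parser settings as above, applied to a toy snippet.
sample = '<div><script>var x = 1;</script><p>keep me</p></div>'
parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
selector = etree.HTML(sample, parser=parser)
for bad in selector.xpath('//script'):
    bad.getparent().remove(bad)  # detach the matched node from its parent
print(etree.tostring(selector, encoding='utf-8', pretty_print=True).decode())
# prints the document with the <script> tag removed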
2. Use pyquery to clean up tag attributes and return the processed source code and plain text
#!/usr/bin/env python
# -*-coding:utf-8-*-
from pyquery import PyQuery as pq
from loguru import logger

def pyquery_clean(self, text, url, pq_dict) -> object:
    '''
    pyquery does the necessary attribute-level processing
    :param text: html_content
    :param url: page url, used to complete relative image links
    :param pq_dict: pyquery expressions for the targets to remove
    :return: text content, html content
    '''
    # Dictionary of pq expressions to delete
    remove_by_pq = pq_dict if pq_dict else dict()
    # Tag attribute whitelist
    attr_white_list = ['rowspan', 'colspan']
    # Attribute keys that may carry an image link
    img_key_list = ['src', 'data-echo', 'data-src', 'data-original']
    # Build the pyquery object
    dom = pq(text)

    # Remove useless tags
    for bad_tag in remove_by_pq.values():
        for bad in dom(bad_tag):
            bad_string = pq(bad).html()
            logger.info(f"clean article content : {bad_string}")
        dom.remove(bad_tag)

    # Process the attributes of every tag
    for tag in dom('*'):
        for key, value in list(tag.attrib.items()):
            # Skip logic: keep the rowspan and colspan attributes of tables
            if key in attr_white_list:
                continue
            # Complete incomplete image urls, then store them under 'src'
            if key in img_key_list:
                img_url = self.absolute_url(url, value)
                pq(tag).remove_attr(key)
                pq(tag).attr('src', img_url)
                pq(tag).attr('alt', '')
            # The alt attribute of the img tag is left empty
            elif key == 'alt':
                pq(tag).attr(key, '')
            # Delete all other attributes
            else:
                pq(tag).remove_attr(key)

    return dom.text(), dom.html()
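The image-link branch relies on absolute_url, which is defined on the class further below. A minimal standalone sketch of the same idea (the example urls are made up):

from urllib.parse import urlsplit, urljoin

def absolute_url(baseurl: str, url: str) -> str:
    # Keep the url if it already has a scheme, otherwise join it to the base url.
    return url if urlsplit(url).scheme else urljoin(baseurl, url)

print(absolute_url('https://example.com/news/', '/img/a.png'))
# https://example.com/img/a.png
print(absolute_url('https://example.com/news/', 'https://cdn.example.com/b.png'))
# https://cdn.example.com/b.png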
3. Use regular expressions to clean up spaces and line breaks
#!/usr/bin/env python
# -*-coding:utf-8-*-
import re

def regular_clean(self, str1: str, str2: str):
    '''
    Regular expressions finalize the data format
    :param str1: content
    :param str2: html_content
    :return: the processed pair
    '''
    def new_line(text):
        text = re.sub(r'<br\s?/?>', '<br>', text)
        text = re.sub(
            r'</?a>|</?em>|</?html>|</?body>|'
            r'</?head>|<[a-zA-Z]{1,10}\s?/>|'
            r'</?strong>|</?blockquote>|</?b>|'
            r'</?span>|</?i>|</?hr>|</?font>',
            '', text)
        text = re.sub(r'\n', '', text)
        text = re.sub(r'<h[1-6]>', '<p>', text)
        text = re.sub(r'</h[1-6]>', '</p>', text)
        text = text.replace('</p>', '</p>\n').replace('<br>', '<br/>')
        return text

    str1, str2 = self.clean_blank(str1), self.clean_blank(str2)  # TODO handle blank-line issues

    # TODO html_content processing:
    # 1. Delete unnecessary tags and tags that break the data display
    # 2. Handle line breaks and replacements
    str2 = new_line(text=str2)

    return str1, str2
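To make the effect of new_line concrete, here is a trimmed standalone sketch run on a made-up string (only a few of the tag patterns above are kept for the demo):

import re

def new_line_demo(text):
    text = re.sub(r'<br\s?/?>', '<br>', text)   # normalize <br> variants
    text = re.sub(r'</?span>|</?b>', '', text)  # trimmed tag list for the demo
    text = re.sub(r'<h[1-6]>', '<p>', text)     # headings become paragraphs
    text = re.sub(r'</h[1-6]>', '</p>', text)
    return text.replace('</p>', '</p>\n').replace('<br>', '<br/>')

print(new_line_demo('<h2>Title</h2><span>line one<br/>line two</span>'))
# <p>Title</p>
# line one<br/>line two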
Finally, the methods above are wrapped into a single class; the complete code is shown below:
#!/usr/bin/env python
# -*-coding:utf-8-*-

'''
author: szhan
date: 2020-08-17
summary: Clean html_content and get a pure data format
'''

import re
from lxml import etree
from pyquery import PyQuery as pq
from urllib.parse import urlsplit, urljoin
from loguru import logger


class CleanArticle:

    def __init__(
            self,
            text: str,
            url: str = '',
            xpath_dict: dict = None,
            pq_dict: dict = None
    ):
        self.text = text
        self.url = url
        self.xpath_dict = xpath_dict or dict()
        self.pq_dict = pq_dict or dict()

    @staticmethod
    def absolute_url(baseurl: str, url: str) -> str:
        '''
        Complete a relative url
        :param baseurl: scheme url
        :param url: target url
        :return: complete url
        '''
        target_url = url if urlsplit(url).scheme else urljoin(baseurl, url)
        return target_url

    @staticmethod
    def clean_blank(text):
        '''
        Whitespace handling
        :param text:
        :return:
        '''
        text = text.replace('&#13;', '').replace('\u3000', '').replace('\t', '').replace('\xa0', '')
        text = re.sub(r'\s{2,}', '', text)
        text = re.sub(r'\n{2,}', '\n', text)
        text = text.strip('\n').strip()
        return text

    def run(self):
        '''
        :return: processed content, html_content
        '''
        if (not bool(self.text)) or (not isinstance(self.text, str)):
            raise ValueError('html_content has a bad type value')
        # Step 1: xpath removes blank text, comments and tags such as iframe, button, form, script, style, video
        text = self.xpath_clean(self.text, self.xpath_dict)
        # Step 2: pyquery handles the attribute-level details
        str1, str2 = self.pyquery_clean(text, self.url, self.pq_dict)
        # Step 3: final regular-expression processing
        content, html_content = self.regular_clean(str1, str2)
        return content, html_content

    def xpath_clean(self, text: str, xpath_dict: dict) -> str:
        '''
        xpath removes unnecessary elements
        :param text: html_content
        :param xpath_dict: xpath expressions for the targets to remove
        :return: html_content as a string
        '''
        remove_by_xpath = xpath_dict if xpath_dict else dict()

        # Tags that should be removed in almost every case, barring extreme situations
        remove_by_xpath.update({
            '_remove_2': '//iframe',
            '_remove_4': '//button',
            '_remove_5': '//form',
            '_remove_6': '//input',
            '_remove_7': '//select',
            '_remove_8': '//option',
            '_remove_9': '//textarea',
            '_remove_10': '//figure',
            '_remove_11': '//figcaption',
            '_remove_12': '//frame',
            '_remove_13': '//video',
            '_remove_14': '//script',
            '_remove_15': '//style'
        })

        parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
        selector = etree.HTML(text, parser=parser)

        # Regular deletion: remove every element matched by the xpath expressions
        for xpath in remove_by_xpath.values():
            for bad in selector.xpath(xpath):
                bad_string = etree.tostring(bad, encoding='utf-8', pretty_print=True).decode()
                logger.info(f"clean article content : {bad_string}")
                bad.getparent().remove(bad)

        skip_tip = "name()='img' or name()='tr' or " \
                   "name()='th' or name()='tbody' or " \
                   "name()='thead' or name()='table'"
        # Walk every tag outside the whitelist; delete it if it contains
        # neither visible text nor a whitelisted descendant
        for p in selector.xpath(f"//*[not({skip_tip})]"):
            # Skip logic
            if p.xpath(f".//*[{skip_tip}]") or \
                    bool(re.sub(r'\s', '', p.xpath('string(.)'))):
                continue
            bad_p = etree.tostring(p, encoding='utf-8', pretty_print=True).decode()
            logger.info(f"clean p tag : {bad_p}")
            p.getparent().remove(p)

        return etree.tostring(selector, encoding='utf-8', pretty_print=True).decode()

    def pyquery_clean(self, text, url, pq_dict) -> object:
        '''
        pyquery does the necessary attribute-level processing
        :param text: html_content
        :param url: page url, used to complete relative image links
        :param pq_dict: pyquery expressions for the targets to remove
        :return: text content, html content
        '''
        # Dictionary of pq expressions to delete
        remove_by_pq = pq_dict if pq_dict else dict()
        # Tag attribute whitelist
        attr_white_list = ['rowspan', 'colspan']
        # Attribute keys that may carry an image link
        img_key_list = ['src', 'data-echo', 'data-src', 'data-original']
        # Build the pyquery object
        dom = pq(text)

        # Remove useless tags
        for bad_tag in remove_by_pq.values():
            for bad in dom(bad_tag):
                bad_string = pq(bad).html()
                logger.info(f"clean article content : {bad_string}")
            dom.remove(bad_tag)

        # Process the attributes of every tag
        for tag in dom('*'):
            for key, value in list(tag.attrib.items()):
                # Skip logic: keep the rowspan and colspan attributes of tables
                if key in attr_white_list:
                    continue
                # Complete incomplete image urls, then store them under 'src'
                if key in img_key_list:
                    img_url = self.absolute_url(url, value)
                    pq(tag).remove_attr(key)
                    pq(tag).attr('src', img_url)
                    pq(tag).attr('alt', '')
                # The alt attribute of the img tag is left empty
                elif key == 'alt':
                    pq(tag).attr(key, '')
                # Delete all other attributes
                else:
                    pq(tag).remove_attr(key)

        return dom.text(), dom.html()

    def regular_clean(self, str1: str, str2: str):
        '''
        Regular expressions finalize the data format
        :param str1: content
        :param str2: html_content
        :return: the processed pair
        '''
        def new_line(text):
            text = re.sub(r'<br\s?/?>', '<br>', text)
            text = re.sub(
                r'</?a>|</?em>|</?html>|</?body>|'
                r'</?head>|<[a-zA-Z]{1,10}\s?/>|'
                r'</?strong>|</?blockquote>|</?b>|'
                r'</?span>|</?i>|</?hr>|</?font>',
                '', text)
            text = re.sub(r'\n', '', text)
            text = re.sub(r'<h[1-6]>', '<p>', text)
            text = re.sub(r'</h[1-6]>', '</p>', text)
            text = text.replace('</p>', '</p>\n').replace('<br>', '<br/>')
            return text

        str1, str2 = self.clean_blank(str1), self.clean_blank(str2)  # TODO handle blank-line issues

        # TODO html_content processing:
        # 1. Delete unnecessary tags and tags that break the data display
        # 2. Handle line breaks and replacements
        str2 = new_line(text=str2)

        return str1, str2


if __name__ == '__main__':
    with open('html_content.html', 'r', encoding='utf-8') as f:
        lines = f.readlines()
        html = ''
        for line in lines:
            html += line

    ca = CleanArticle(text=html)
    _, html_content = ca.run()
    print(html_content)
Summary
This concludes the detailed walkthrough of the format-cleaning tool built on xpath selectors, PyQuery, and regular expressions. For more information about PyQuery and regular expressions, please search my previous articles or continue browsing the related articles below. I hope you will keep supporting me!