1. Use xpath to remove unnecessary tags and tags with no content
import re

from lxml import etree
from loguru import logger

def xpath_clean(self, text: str, xpath_dict: dict) -> str:
    '''
    xpath removes unnecessary elements
    :param text: html_content
    :param xpath_dict: xpath expressions for the targets to remove
    :return: html_content as a string
    '''
    remove_by_xpath = xpath_dict if xpath_dict else dict()

    # Tags that should be removed in almost every case, barring extreme situations
    remove_by_xpath.update({
        '_remove_2': '//iframe',
        '_remove_4': '//button',
        '_remove_5': '//form',
        '_remove_6': '//input',
        '_remove_7': '//select',
        '_remove_8': '//option',
        '_remove_9': '//textarea',
        '_remove_10': '//figure',
        '_remove_11': '//figcaption',
        '_remove_12': '//frame',
        '_remove_13': '//video',
        '_remove_14': '//script',
        '_remove_15': '//style'
    })

    parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
    selector = etree.HTML(text, parser=parser)

    # Regular deletion: remove every element matched by the xpath expressions
    for xpath in remove_by_xpath.values():
        for bad in selector.xpath(xpath):
            bad_string = etree.tostring(bad, encoding='utf-8', pretty_print=True).decode()
            logger.info(f"clean article content : {bad_string}")
            bad.getparent().remove(bad)

    skip_tip = "name()='img' or name()='tr' or " \
               "name()='th' or name()='tbody' or " \
               "name()='thead' or name()='table'"
    # Walk every tag outside the whitelist; delete it if it contains
    # neither visible text nor a whitelisted descendant
    for p in selector.xpath(f"//*[not({skip_tip})]"):
        # Skip logic
        if p.xpath(f".//*[{skip_tip}]") or \
                bool(re.sub(r'\s', '', p.xpath('string(.)'))):
            continue
        bad_p = etree.tostring(p, encoding='utf-8', pretty_print=True).decode()
        logger.info(f"clean p tag : {bad_p}")
        p.getparent().remove(p)

    return etree.tostring(selector, encoding='utf-8', pretty_print=True).decode()
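To sanity-check this step in isolation, the following minimal sketch applies the same parser settings and removal loop to a made-up snippet (the sample HTML is an example, not from a real page):

from lxml import etree

# Minimal sketch: same parser settings as above, applied to a toy snippet.
sample = '<div><script>var x = 1;</script><p>keep me</p></div>'
parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
selector = etree.HTML(sample, parser=parser)
for bad in selector.xpath('//script'):
    bad.getparent().remove(bad)  # detach the matched node from its parent
print(etree.tostring(selector, encoding='utf-8', pretty_print=True).decode())
# prints the document with the <script> tag removed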
2. Use pyquery to clean up tag attributes and return the processed source code and plain text
#!/usr/bin/env python
# -*-coding:utf-8-*-
from pyquery import PyQuery as pq
from loguru import logger

def pyquery_clean(self, text, url, pq_dict) -> object:
    '''
    pyquery does the necessary attribute-level processing
    :param text: html_content
    :param url: page url, used to complete relative image links
    :param pq_dict: pyquery expressions for the targets to remove
    :return: text content, html content
    '''
    # Dictionary of pq expressions to delete
    remove_by_pq = pq_dict if pq_dict else dict()
    # Tag attribute whitelist
    attr_white_list = ['rowspan', 'colspan']
    # Attribute keys that may carry an image link
    img_key_list = ['src', 'data-echo', 'data-src', 'data-original']
    # Build the pyquery object
    dom = pq(text)

    # Remove useless tags
    for bad_tag in remove_by_pq.values():
        for bad in dom(bad_tag):
            bad_string = pq(bad).html()
            logger.info(f"clean article content : {bad_string}")
        dom.remove(bad_tag)

    # Process the attributes of every tag
    for tag in dom('*'):
        for key, value in list(tag.attrib.items()):
            # Skip logic: keep the rowspan and colspan attributes of tables
            if key in attr_white_list:
                continue
            # Complete incomplete image urls, then store them under 'src'
            if key in img_key_list:
                img_url = self.absolute_url(url, value)
                pq(tag).remove_attr(key)
                pq(tag).attr('src', img_url)
                pq(tag).attr('alt', '')
            # The alt attribute of the img tag is left empty
            elif key == 'alt':
                pq(tag).attr(key, '')
            # Delete all other attributes
            else:
                pq(tag).remove_attr(key)

    return dom.text(), dom.html()
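The image-link branch relies on absolute_url, which is defined on the class further below. A minimal standalone sketch of the same idea (the example urls are made up):

from urllib.parse import urlsplit, urljoin

def absolute_url(baseurl: str, url: str) -> str:
    # Keep the url if it already has a scheme, otherwise join it to the base url.
    return url if urlsplit(url).scheme else urljoin(baseurl, url)

print(absolute_url('https://example.com/news/', '/img/a.png'))
# https://example.com/img/a.png
print(absolute_url('https://example.com/news/', 'https://cdn.example.com/b.png'))
# https://cdn.example.com/b.png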
3. Use regular expressions to clean up spaces and line breaks
#!/usr/bin/env python
# -*-coding:utf-8-*-
import re

def regular_clean(self, str1: str, str2: str):
    '''
    Regular expressions finalize the data format
    :param str1: content
    :param str2: html_content
    :return: the processed pair
    '''
    def new_line(text):
        text = re.sub(r'<br\s?/?>', '<br>', text)
        text = re.sub(
            r'</?a>|</?em>|</?html>|</?body>|'
            r'</?head>|<[a-zA-Z]{1,10}\s?/>|'
            r'</?strong>|</?blockquote>|</?b>|'
            r'</?span>|</?i>|</?hr>|</?font>',
            '', text)
        text = re.sub(r'\n', '', text)
        text = re.sub(r'<h[1-6]>', '<p>', text)
        text = re.sub(r'</h[1-6]>', '</p>', text)
        text = text.replace('</p>', '</p>\n').replace('<br>', '<br/>')
        return text

    str1, str2 = self.clean_blank(str1), self.clean_blank(str2)  # TODO handle blank-line issues

    # TODO html_content processing:
    # 1. Delete unnecessary tags and tags that break the data display
    # 2. Handle line breaks and replacements
    str2 = new_line(text=str2)

    return str1, str2
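To make the effect of new_line concrete, here is a trimmed standalone sketch run on a made-up string (only a few of the tag patterns above are kept for the demo):

import re

def new_line_demo(text):
    text = re.sub(r'<br\s?/?>', '<br>', text)   # normalize <br> variants
    text = re.sub(r'</?span>|</?b>', '', text)  # trimmed tag list for the demo
    text = re.sub(r'<h[1-6]>', '<p>', text)     # headings become paragraphs
    text = re.sub(r'</h[1-6]>', '</p>', text)
    return text.replace('</p>', '</p>\n').replace('<br>', '<br/>')

print(new_line_demo('<h2>Title</h2><span>line one<br/>line two</span>'))
# <p>Title</p>
# line one<br/>line two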
Finally, the methods above are wrapped into a single class; the complete code is shown below:
#!/usr/bin/env python
# -*-coding:utf-8-*-

'''
author: szhan
date: 2020-08-17
summary: Clean html_content and get a pure data format
'''

import re
from lxml import etree
from pyquery import PyQuery as pq
from urllib.parse import urlsplit, urljoin
from loguru import logger


class CleanArticle:

    def __init__(
            self,
            text: str,
            url: str = '',
            xpath_dict: dict = None,
            pq_dict: dict = None
    ):
        self.text = text
        self.url = url
        self.xpath_dict = xpath_dict or dict()
        self.pq_dict = pq_dict or dict()

    @staticmethod
    def absolute_url(baseurl: str, url: str) -> str:
        '''
        Complete a relative url
        :param baseurl: scheme url
        :param url: target url
        :return: complete url
        '''
        target_url = url if urlsplit(url).scheme else urljoin(baseurl, url)
        return target_url

    @staticmethod
    def clean_blank(text):
        '''
        Whitespace handling
        :param text:
        :return:
        '''
        text = text.replace('&#13;', '').replace('\u3000', '').replace('\t', '').replace('\xa0', '')
        text = re.sub(r'\s{2,}', '', text)
        text = re.sub(r'\n{2,}', '\n', text)
        text = text.strip('\n').strip()
        return text

    def run(self):
        '''
        :return: processed content, html_content
        '''
        if (not bool(self.text)) or (not isinstance(self.text, str)):
            raise ValueError('html_content has a bad type value')
        # Step 1: xpath removes blank text, comments and tags such as iframe, button, form, script, style, video
        text = self.xpath_clean(self.text, self.xpath_dict)
        # Step 2: pyquery handles the attribute-level details
        str1, str2 = self.pyquery_clean(text, self.url, self.pq_dict)
        # Step 3: final regular-expression processing
        content, html_content = self.regular_clean(str1, str2)
        return content, html_content

    def xpath_clean(self, text: str, xpath_dict: dict) -> str:
        '''
        xpath removes unnecessary elements
        :param text: html_content
        :param xpath_dict: xpath expressions for the targets to remove
        :return: html_content as a string
        '''
        remove_by_xpath = xpath_dict if xpath_dict else dict()

        # Tags that should be removed in almost every case, barring extreme situations
        remove_by_xpath.update({
            '_remove_2': '//iframe',
            '_remove_4': '//button',
            '_remove_5': '//form',
            '_remove_6': '//input',
            '_remove_7': '//select',
            '_remove_8': '//option',
            '_remove_9': '//textarea',
            '_remove_10': '//figure',
            '_remove_11': '//figcaption',
            '_remove_12': '//frame',
            '_remove_13': '//video',
            '_remove_14': '//script',
            '_remove_15': '//style'
        })

        parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
        selector = etree.HTML(text, parser=parser)

        # Regular deletion: remove every element matched by the xpath expressions
        for xpath in remove_by_xpath.values():
            for bad in selector.xpath(xpath):
                bad_string = etree.tostring(bad, encoding='utf-8', pretty_print=True).decode()
                logger.info(f"clean article content : {bad_string}")
                bad.getparent().remove(bad)

        skip_tip = "name()='img' or name()='tr' or " \
                   "name()='th' or name()='tbody' or " \
                   "name()='thead' or name()='table'"
        # Walk every tag outside the whitelist; delete it if it contains
        # neither visible text nor a whitelisted descendant
        for p in selector.xpath(f"//*[not({skip_tip})]"):
            # Skip logic
            if p.xpath(f".//*[{skip_tip}]") or \
                    bool(re.sub(r'\s', '', p.xpath('string(.)'))):
                continue
            bad_p = etree.tostring(p, encoding='utf-8', pretty_print=True).decode()
            logger.info(f"clean p tag : {bad_p}")
            p.getparent().remove(p)

        return etree.tostring(selector, encoding='utf-8', pretty_print=True).decode()

    def pyquery_clean(self, text, url, pq_dict) -> object:
        '''
        pyquery does the necessary attribute-level processing
        :param text: html_content
        :param url: page url, used to complete relative image links
        :param pq_dict: pyquery expressions for the targets to remove
        :return: text content, html content
        '''
        # Dictionary of pq expressions to delete
        remove_by_pq = pq_dict if pq_dict else dict()
        # Tag attribute whitelist
        attr_white_list = ['rowspan', 'colspan']
        # Attribute keys that may carry an image link
        img_key_list = ['src', 'data-echo', 'data-src', 'data-original']
        # Build the pyquery object
        dom = pq(text)

        # Remove useless tags
        for bad_tag in remove_by_pq.values():
            for bad in dom(bad_tag):
                bad_string = pq(bad).html()
                logger.info(f"clean article content : {bad_string}")
            dom.remove(bad_tag)

        # Process the attributes of every tag
        for tag in dom('*'):
            for key, value in list(tag.attrib.items()):
                # Skip logic: keep the rowspan and colspan attributes of tables
                if key in attr_white_list:
                    continue
                # Complete incomplete image urls, then store them under 'src'
                if key in img_key_list:
                    img_url = self.absolute_url(url, value)
                    pq(tag).remove_attr(key)
                    pq(tag).attr('src', img_url)
                    pq(tag).attr('alt', '')
                # The alt attribute of the img tag is left empty
                elif key == 'alt':
                    pq(tag).attr(key, '')
                # Delete all other attributes
                else:
                    pq(tag).remove_attr(key)

        return dom.text(), dom.html()

    def regular_clean(self, str1: str, str2: str):
        '''
        Regular expressions finalize the data format
        :param str1: content
        :param str2: html_content
        :return: the processed pair
        '''
        def new_line(text):
            text = re.sub(r'<br\s?/?>', '<br>', text)
            text = re.sub(
                r'</?a>|</?em>|</?html>|</?body>|'
                r'</?head>|<[a-zA-Z]{1,10}\s?/>|'
                r'</?strong>|</?blockquote>|</?b>|'
                r'</?span>|</?i>|</?hr>|</?font>',
                '', text)
            text = re.sub(r'\n', '', text)
            text = re.sub(r'<h[1-6]>', '<p>', text)
            text = re.sub(r'</h[1-6]>', '</p>', text)
            text = text.replace('</p>', '</p>\n').replace('<br>', '<br/>')
            return text

        str1, str2 = self.clean_blank(str1), self.clean_blank(str2)  # TODO handle blank-line issues

        # TODO html_content processing:
        # 1. Delete unnecessary tags and tags that break the data display
        # 2. Handle line breaks and replacements
        str2 = new_line(text=str2)

        return str1, str2


if __name__ == '__main__':
    with open('html_content.html', 'r', encoding='utf-8') as f:
        lines = f.readlines()
        html = ''
        for line in lines:
            html += line

    ca = CleanArticle(text=html)
    _, html_content = ca.run()
    print(html_content)
Summary
This concludes the detailed walkthrough of the format-cleaning tool built on xpath selectors, PyQuery, and regular expressions. For more information about PyQuery and regular expressions, please search my previous articles or continue browsing the related articles below. I hope you will keep supporting me!