SoFunction
Updated on 2024-10-30

Example of python implementation of disk log cleaning

I. Description:

Componentize the Python code as a reusable module for disk log-file cleanup.

II. Goal:

Empty outdated log files, clean up log files that are larger than the customized size.

III. Original code

#!/usr/bin/env python
# -*- coding: utf-8 -*-
 
import commands
import os
import time
import re
import getopt
import sys
 
# Run *cmd* through the local shell.
# Returns a two-element tuple (status, result): status is the command's
# exit status (int), result is its combined output split into a list of
# lines.
# NOTE(review): uses the Python-2-only `commands` module, consistent with
# the file's imports; under Python 3 this would be subprocess.getstatusoutput.
def execute_local_shell_cmd(cmd):
  status, result = commands.getstatusoutput(cmd)

  result = result.split("\n")

  return status, result
 
def send_alert_mail():
  # TODO: not implemented — hook up mail delivery here so that
  # clean_data_release_disk can alert a human when the disk stays full.
  pass
 
 
 
'''
Return (status, used) for the given disk/mount point, where *used* is
the "Use%" column reported by df, e.g. '42%'.
'''
def get_disk_used(disk_name):
  df_cmd = "df | grep %s | awk '{print $5}'" % disk_name
  status, output_lines = execute_local_shell_cmd(df_cmd)
  # Only the first matching line is of interest.
  return status, output_lines[0]
 
#print(get_disk_used('/data0'))
 
 
'''
Return True if the file at *file_path* was modified within the last
*time_interval* (e.g. '1d', '2h'), False otherwise.
'''
def file_modify_in(file_path, time_interval='1d'):
  current_time = time.time()
  # os.path.getmtime returns the last-modification time as seconds
  # since the Unix epoch.
  if current_time - os.path.getmtime(file_path) < translate_time_interval_to_second(time_interval):
    return True
  return False
 
def translate_file_size_to_kb(file_size):
  """Translate a human-readable size such as '10g', '512mb' or '100k'
  into a number of bytes (despite the legacy name, the result is bytes,
  kept for caller compatibility).

  Raises IOError when the input has no leading number or an unknown
  unit suffix.  Supported suffixes: g/gb, m/mb, k/kb, b/byte.
  """
  file_size = str(file_size.lower())
  # One or more digits, an optional decimal point, then optional digits.
  pattern = re.compile(r'\d+\.?\d*')
  match = pattern.match(file_size)
  if match:
    file_size_number = float(match.group())
  else:
    raise IOError("Input {0} can't translate to byte."
           "Current support g(gb)/m(mb)/k(kb)/b(byte)".format(file_size))
  # endswith() decides the unit; 'g'/'m'/'k' are tested before the bare
  # 'b' case so that 'gb'/'mb'/'kb' are not mistaken for bytes.
  if file_size.endswith("g") or file_size.endswith("gb"):
    return file_size_number * 1024 * 1024 * 1024
  elif file_size.endswith("m") or file_size.endswith("mb"):
    return file_size_number * 1024 * 1024
  elif file_size.endswith("k") or file_size.endswith("kb"):
    return file_size_number * 1024
  elif file_size.endswith("b") or file_size.endswith("byte"):
    return file_size_number
  else:
    raise IOError("Input {0} can't translate to byte."
            "Current support g(gb)/m(mb)/k(kb)/b(byte)".format(file_size))
#print(translate_file_size_to_kb('10g'))
 
def translate_time_interval_to_second(time_interval):
  """Translate a human-readable interval such as '7d', '12h', '30min'
  or '45s' into a number of seconds.

  Raises IOError when the input has no leading number or an unknown
  unit suffix.  Supported suffixes: d/day, h/hour, m/min, s/sec.
  """
  date_interval = str(time_interval.lower())
  # One or more leading digits form the magnitude.
  pattern = re.compile(r'\d+')
  match = pattern.match(date_interval)
  if match:
    date_interval_number = int(match.group())
  else:
    raise IOError("Input {0} can't translate to second."
           "Current support d(day)/h(hour)/m(min)/s(sec)".format(date_interval))
  if date_interval.endswith('d') or date_interval.endswith('day'):
    return date_interval_number * 24 * 3600
  elif date_interval.endswith('h') or date_interval.endswith('hour'):
    return date_interval_number * 3600
  elif date_interval.endswith('m') or date_interval.endswith('min'):
    return date_interval_number * 60
  elif date_interval.endswith('s') or date_interval.endswith('sec'):
    return date_interval_number
  else:
    raise IOError("Input {0} can't translate to second."
           "Current support d(day)/h(hour)/m(min)/s(second)".format(date_interval))
 
#print(translate_time_interval_to_second('7d'))
'''
Heuristically decide whether *file_path* could be the log file that is
currently being written to:
1) it was modified within *modify_in* (default one day), or
2) its name ends with *pattern*.
'''
def probable_current_log_file(file_path, pattern='log', modify_in='1d'):
  recently_touched = file_modify_in(file_path, time_interval=modify_in)
  return True if recently_touched else str(file_path).endswith(pattern)
 
'''
Yield full paths of files under *target_dir* that are older than
*before_days_remove* and whose name contains *pattern*.  Files that
look like the log currently being written are skipped — see
probable_current_log_file for how that is decided.
'''
def get_clean_log_list_by_date(target_dir, before_days_remove='7d', pattern="log"):
  before_seconds_remove = translate_time_interval_to_second(before_days_remove)
  current_time = time.time()
  # os.listdir returns the names of the entries in the directory.
  for candidate_file in os.listdir(target_dir):
    candidate_file_fullpath = "%s/%s" % (target_dir, candidate_file)
    # Only consider regular files.
    if os.path.isfile(candidate_file_fullpath):
      candidate_file_mtime = os.path.getmtime(candidate_file_fullpath)

      # find() returns -1 when *pattern* does not occur in the name.
      if current_time - candidate_file_mtime > before_seconds_remove \
        and candidate_file.find(pattern) != -1 \
        and not probable_current_log_file(candidate_file_fullpath):
        # Lazily yield matches so the caller can delete as it iterates.
        yield candidate_file_fullpath
 
'''
Yield full paths of files under *target_dir* whose name contains
*pattern* and whose size is at least *file_size_limit*.
'''
def get_clean_log_list_by_size(target_dir, file_size_limit='10g', pattern="log"):
  file_size_limit_byte = translate_file_size_to_kb(file_size_limit)
  for candidate_file in os.listdir(target_dir):
    candidate_file_fullpath = "%s/%s" % (target_dir, candidate_file)
    if os.path.isfile(candidate_file_fullpath):
      # os.stat gives the file's status; st_size is its size in bytes.
      file_stat = os.stat(candidate_file_fullpath)
      if candidate_file.find(pattern) != -1 and \
              file_stat.st_size >= file_size_limit_byte:
        yield candidate_file_fullpath
 
'''
Delete every file in *file_list*.  A file that looks like the log
currently being written (or any file when *roll_back* is True) is
truncated with "cat /dev/null > file" instead of being deleted, so the
writing process keeps a valid file handle.
'''
def remove_file_list(file_list, pattern='log', roll_back=False):
  for file_item in file_list:
    if roll_back or probable_current_log_file(file_item, pattern=pattern, modify_in='1d'):
      print('roll back file %s' % file_item)
      execute_local_shell_cmd("cat /dev/null > {0}".format(file_item))
    else:
      print('remove file %s' % file_item)
      # os.remove raises OSError if the path is a directory.
      os.remove(file_item)
 
'''
Remove log files under *target_dir* that are older than
*before_days_remove* and whose name contains *pattern*.
'''
def remove_files_by_date(target_dir, before_days_remove='7d', pattern='log'):
  expired_files = get_clean_log_list_by_date(target_dir, before_days_remove, pattern)
  remove_file_list(expired_files)
 
'''
Remove log files under *target_dir* that are at least *file_size_limit*
big and whose name contains *pattern*.
'''
def remove_files_by_size(target_dir, file_size_limit='10g', pattern='log'):
  oversized_files = get_clean_log_list_by_size(target_dir, file_size_limit, pattern)
  remove_file_list(oversized_files)
 
'''
Truncate (via "cat /dev/null > {log_file}") files under *target_dir*
that end with *pattern* and are at least *file_size_limit* big.  Used
for the log file currently being written, which must not be deleted.
'''

def clean_curren_log_file(target_dir, file_size_limit='10g', pattern='log'):
  # The size conversion is loop-invariant, so do it once up front.
  file_size_limit_byte = translate_file_size_to_kb(file_size_limit)
  for candidate_file in os.listdir(target_dir):
    candidate_file_fullpath = '%s/%s' % (target_dir, candidate_file)
    if candidate_file.endswith(pattern) and os.path.isfile(candidate_file_fullpath):
      file_stat = os.stat(candidate_file_fullpath)
      if file_stat.st_size >= file_size_limit_byte:
        # roll_back=True forces truncation instead of deletion.
        remove_file_list([candidate_file_fullpath], roll_back=True)
 
def clean_data_release_disk(disk_name, target_dir, disk_used_limit='80%', before_days_remove='7d',
              file_size_limit='10g', pattern='log'):
  """Free space on *disk_name* in escalating steps: first delete logs
  older than *before_days_remove*; if usage is still above
  *disk_used_limit*, delete logs bigger than *file_size_limit*; if
  still above, truncate the live log; finally send an alert mail.
  """
  used_limit = int(disk_used_limit.replace('%', ''))

  # Step one: time-based cleanup.
  print('Step one remove files {0} ago.'.format(before_days_remove))
  remove_files_by_date(target_dir, before_days_remove=before_days_remove, pattern=pattern)

  # Step two: if the disk is still too full, size-based cleanup.
  current_disk_used = int(get_disk_used(disk_name)[1].replace('%', ''))
  if current_disk_used > used_limit:
    print("Disk {0}'s current used {1}% great than input used limit {2}%,"
       "so we will remove files bigger than {3}".
       format(disk_name, current_disk_used, used_limit, file_size_limit))
    remove_files_by_size(target_dir, file_size_limit=file_size_limit, pattern=pattern)

  # Step three: still too full — truncate the log being written, and alert.
  current_disk_used = int(get_disk_used(disk_name)[1].replace('%', ''))
  if current_disk_used > used_limit:
    print("Disk {0}'s current used {1}% great than input used limit {2}%,"
       "so we will roll back current log file".
       format(disk_name, current_disk_used, used_limit, file_size_limit))
    clean_curren_log_file(target_dir, file_size_limit=file_size_limit, pattern=pattern)

  # Last resort: ask a human for help.
  if int(get_disk_used(disk_name)[1].replace('%', '')) > used_limit:
    send_alert_mail()
 
def usage():
  """Print the command-line usage summary for this cleaning script."""
  # Fixed: '<target_dirctory' was misspelled and missing its closing '>'.
  print(' -d <target_disk> -r <target_directory> -u <diskUsedLimit(default 80%)> '
     '-f <fileSizeLimit(default 10gb,gb/mb/kb)> -p <filePattern(default log)> '
     '-t <beforeDaysRemove(default 7d,d)> ')
if __name__ == "__main__":
  target_disk_input = '/data0'
  target_dir_input = '/data0/hadoop2/logs'
  disk_used_limit_input = '80%'
  file_size_limit_input = '10g'
  pattern_input = 'log'
  before_days_remove_input = '7d'
  try:
    # getopt command parsing with short and long options
    # getopt returns two arguments: one corresponding to the parameter options and value tuple, and the other generally empty
    opts,args = ([1:], 'hd:r:u:f:p:t:', ['help' 'disk=', 'directory=',
                                  'diskUsedLimit=', 'fileSizeLimit=',
                                  'filePattern=', 'beforeDaysRemove='])
  # getopt module function exception error, catch exception and print error
  except  as err:
    print err
    usage()
    (2)
 
  if len(opts) < 6:
    usage()
    (2)
 
  for opt,arg in opts:
    if opt == '-h':
      usage()
      ()
    elif opt in ("-d","--disk"):
      target_disk_input = ('/','')
    elif opt in ("-r","--directory"):
      target_dir_input = arg
    elif opt in ("-u","--diskUsedLimit"):
      disk_used_limit_input = arg
    elif opt in ("-f","--fileSizeLimit"):
      file_size_limit_input = arg
      translate_file_size_to_kb(file_size_limit_input)
    elif opt in ("-p","filePattern"):
      pattern_input = arg
    elif opt in ("-t","--beforeDaysRemove"):
      before_days_remove_input = arg
      translate_time_interval_to_second(before_days_remove_input)
 
  print ("{0} Start clean job.target_disk:{1},target_directory:{2},disk_used_limit:{3},"
      "file_size_limit:{4},pattern:{5},before_days_remove:{6}".format((()),
                                      target_disk_input, target_dir_input,
                                      disk_used_limit_input, file_size_limit_input,
                                      pattern_input, before_days_remove_input))
  clean_data_release_disk(target_disk_input, target_dir_input,
              disk_used_limit=disk_used_limit_input, file_size_limit=file_size_limit_input,
              pattern=pattern_input, before_days_remove=before_days_remove_input)

IV. Scheduled deletion: a driver script that invokes the cleaner for every directory

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
 
# Recursively walk *targetdir* and yield the full path of every
# sub-directory found beneath it.
def Lisdir(targetdir):
  for root, dirs, files in os.walk(targetdir):
    for d in dirs:
      yield os.path.join(root, d)
 
# For every direct sub-directory of *targetdir*, yield a generator over
# all directories nested beneath it (see Lisdir).
def log_dir(targetdir):
  for entry in os.listdir(targetdir):
    if os.path.isdir(os.path.join(targetdir, entry)):
      yield Lisdir(os.path.join(targetdir, entry))
# Run the cleaner over every nested directory that does NOT end with
# 'log-bin' (those hold the live binlogs and are skipped).
for path in log_dir('/data0/backup_log-bin'):
  for ppp in path:
    if ppp.endswith('log-bin') is False:
      # NOTE(review): reconstructed as os.system — the scraper stripped
      # the call; confirm the interpreter prefix matches the deployment.
      os.system("python db_script/clean_robo.py -d /data0 -r {0} -u 75% -f 501M -p bin -t 5d".format(ppp))

The above is a detailed example of implementing disk log cleanup in Python. For more on this topic, please see my other related articles!