General Labeling
# Build a per-customer binary label from statement data.
# An account gets label 1 when any of AGE3..AGE6 is positive on at least one
# of its statement rows; a customer gets label 1 when any of their accounts
# does, otherwise 0.
overdue_mask = (
    (df_train_stmt.AGE3 > 0)
    | (df_train_stmt.AGE4 > 0)
    | (df_train_stmt.AGE5 > 0)
    | (df_train_stmt.AGE6 > 0)
)
odue_df = (
    df_train_stmt.loc[overdue_mask, ['XACCOUNT']]
    .drop_duplicates()
    .assign(label=1)
)

# One row per (customer, account) pair.
cust_df = df_acct[['CUSTR_NBR', 'XACCOUNT']].drop_duplicates()

# Do the merge: the left join keeps every account; accounts without a match in
# odue_df get NaN in 'label'. The per-customer max ignores NaN, so a customer
# whose accounts are all unmatched ends up NaN and is filled with 0.
df_y = (
    cust_df.merge(odue_df, how='left', on='XACCOUNT')
    .groupby('CUSTR_NBR')
    .agg({'label': 'max'})
    .reset_index()
    .fillna(0)
)
Using functions for labeling
# Labeling with a row-wise function
def label(row):
    """Return the label for one row of the DataFrame.

    Reads the 'Date_received' and 'Date' fields, both strings in YYYYMMDD
    format or the literal string 'null'.

    Returns:
        -1 if 'Date_received' is 'null';
         1 if 'Date' is not 'null' and Date - Date_received is at most 15 days;
         0 otherwise.
    """
    if row['Date_received'] == 'null':
        return -1
    if row['Date'] != 'null':
        td = (
            pd.to_datetime(row['Date'], format='%Y%m%d')
            - pd.to_datetime(row['Date_received'], format='%Y%m%d')
        )
        # Compare against a real Timedelta; a bare (15, 'D') tuple cannot be
        # compared with a Timedelta.
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0


dfoff['label'] = dfoff.apply(label, axis=1)


# Labeling by the number of days between two dates
def get_label(s):
    """Return a label for a string of the form '<date1>:<date2>'.

    Both parts are expected in YYYYMMDD format; the first part may be the
    literal string 'null'.

    Returns:
         0 if the first part is 'null';
         1 if date1 - date2 is at most 15 days;
        -1 otherwise.
    """
    def to_date(text):
        # Slice YYYYMMDD into year / month / day.
        return date(int(text[0:4]), int(text[4:6]), int(text[6:8]))

    parts = s.split(':')
    if parts[0] == 'null':
        return 0
    if (to_date(parts[0]) - to_date(parts[1])).days <= 15:
        return 1
    return -1


# NOTE(review): the target of the final assignment was lost in the original
# text (it read " = (get_label)"). Presumably it applies get_label to a
# column, along the lines of:
#     df['label'] = df['label'].apply(get_label)
# Confirm the DataFrame and column names before enabling it.
Supplementary: getting a tag's content by tag name in Python
Look at the code.
import re
import json

import requests
from bs4 import BeautifulSoup
from lxml import etree

# NOTE(review): the original text also contained a bare `import` whose module
# name was lost; add it back here if that module is actually needed.

# NOTE(review): only the path "/places/default/view/Algeria-4" survives in the
# original text; the host below is an assumption -- confirm before running.
PAGE_URL = "http://example.webscraping.com/places/default/view/Algeria-4"
# NOTE(review): the output file name was empty in the original text; this is a
# placeholder.
OUTPUT_PATH = "algeria.html"

# Download the page and save the raw bytes (the file is opened in 'wb' mode).
result = requests.get(PAGE_URL, timeout=10)
with open(OUTPUT_PATH, 'wb') as f:
    f.write(result.content)
# print(parse_regex())

test_data = """
<div>
    <ul>
        <li class="item-0"><a href="" rel="external nofollow" rel="external nofollow" >9,596,960first item</a></li>
        <li class="item-1"><a href="" rel="external nofollow" >second item</a></li>
        <li class="item-inactive"><a href="" rel="external nofollow" >third item</a></li>
        <li class="item-1"><a href="" rel="external nofollow" >fourth item</a></li>
        <li class="item-0"><a href="" rel="external nofollow" rel="external nofollow" >fifth item</a></li>
        <li class="good-0"><a href="" rel="external nofollow" rel="external nofollow" >fifth item</a></li>
    </ul>
    <book>
        <title lang="aaengbb">Harry Potter</title>
        <price >29.99</price>
    </book>
    <book>
        <title lang="zh">Learning XML</title>
        <price>39.95</price>
    </book>
    <book>
        <title>Python</title>
        <price>40</price>
    </book>
</div>
"""

# XPath quick reference (examples refer to test_data above):
#   /     start from the root; every step must be a direct child
#   //    select matching nodes anywhere below the current node
#   *     wildcard, selects all elements
#
#   //div/ul/li/a[@id]                 a tags that have an id attribute
#   //div/ul/li/a                      all a tags
#   //div/ul/li[2]/a                   the a tag under the second li
#   //div/book[1]/title                title of the first book under the div
#   //div/book[1]/title[@lang="zh"]    title of the first book under the div,
#                                      only if its lang attribute is "zh"
#   //div/book/title, //book/title, //title
#                                      same result here, different paths
#   //book/title/@*                    all attribute values of title
#   //book/title/text()                text content of title (built-in function)
#   //a[@href="..." and @...]          several conditions combined with `and`
#                                      NOTE(review): the second condition was
#                                      lost in the original text
#   //div/book[last()]/title/text()    title text of the last book element
#   //div/book[price > 39]/title/text()
#                                      titles of books whose price is above 39
#   //li[starts-with(@class,'item')]   li tags whose class starts with "item"
#   //title[contains(@lang, "eng")]    title tags whose lang attribute
#                                      contains "eng"

html = etree.HTML(test_data)  # Load any HTML string into an element tree
html_data = html.xpath('//title[contains(@lang,"eng")]')  # XPath query
# print(dir(html_data[0]))  # see what the matched element has to offer
print(html_data)
for i in html_data:
    # NOTE(review): the printed expression was lost in the original text;
    # printing the element's text matches the stated goal of getting the
    # tag content.
    print(i.text)
The above is drawn from my personal experience. I hope it serves as a useful reference, and I would appreciate your continued support.