python 字符串中的数字、日期、百分比和年份转中文

在一个语料数据集的处理过程中,遇到了这样的场景:需要把一段字符串中的阿拉伯数字转成中文,
但是年份的情况比较特殊,比如“1987年”转成中文是“一九八七年”(逐位读),而一般的数字如“59岁”则是“五十九岁”(按数值读)。
我的思路是先利用正则处理带有年份的数字部分,然后再进行其他数字的处理,
其中要注意的一个细节是:这里 re.sub 方法的第二个参数传入的是一个函数,方便针对匹配到的部分逐个进行替换。re.sub 会把每次匹配得到的 match 对象作为参数传给这个函数,在函数内部调用 match.group() 即可取得当前匹配到的字符串,处理后返回替换结果。具体的代码如下(使用前需要先 import re):

def match_year_digit(match):
    """Spell out a matched year digit by digit in Chinese numerals.

    Args:
        match: A regex match object whose matched text consists only of
            ASCII digits and, optionally, the character '年'.

    Returns:
        The matched text with every digit replaced by its Chinese numeral;
        '年' is kept as is.

    Raises:
        KeyError: If the matched text contains any other character.
    """
    digit_table = {
        '1': '一', '2': '二', '3': '三', '4': '四', '5': '五',
        '6': '六', '7': '七', '8': '八', '9': '九', '0': '零',
        '年': '年',
    }
    matched_text = str(match.group())
    # Look up each character in turn; an unknown character raises KeyError.
    return ''.join(map(digit_table.__getitem__, matched_text))
def time_thin_filter(sequence):
    """Convert the first clock time found in ``sequence`` to Chinese wording.

    The hour and minute are converted with ``match_common_number`` and
    suffixed with '时' and '分'. An optional seconds field is converted as
    well, but its unit string is empty, so no suffix is added after it.

    Args:
        sequence: Text expected to contain a time such as ``'13:45:12'`` or
            ``'15:50'``.

    Returns:
        The converted time, or ``''`` when ``sequence`` is empty/``None`` or
        contains no time.
    """
    if not sequence:
        return ''
    # ':' is written as a plain literal. The former class '[:|:]' also
    # accepted a literal '|', because '|' is not alternation inside [...].
    time_thin_reg = re.compile(
        r'(?P<hour_part>\d{1,2}):(?P<minute_part>\d{1,2})'
        r':?(?P<second_part>\d{1,2})?'
    )
    result = time_thin_reg.search(sequence)
    if result is None:
        return ''
    unit_list = ['时', '分', '']
    group_names = ('hour_part', 'minute_part', 'second_part')
    reg_list = []
    # Absent fields are skipped explicitly instead of relying on a
    # `return` inside `finally`, which would hide every exception.
    for group_name, unit in zip(group_names, unit_list):
        part = result.group(group_name)
        if part is None:
            # Only the seconds group is optional in the pattern.
            break
        reg_list.append(match_common_number(part))
        reg_list.append(unit)
    return ''.join(reg_list)
def date_thin_filter(sequence):
    """Convert the first ``YYYY-M[-D]`` / ``YYYY/M[/D]`` date to Chinese.

    Years starting with '1' are spelled digit by digit through
    ``match_year_digit``; other years, the month and the day are converted
    with ``match_common_number``. The parts are suffixed with '年', '月'
    and '日'; the day is optional.

    Args:
        sequence: Text expected to contain a date such as ``'1999-12-30'``.

    Returns:
        The converted date, or ``''`` when ``sequence`` is empty/``None`` or
        contains no date.
    """
    if not sequence:
        return ''
    # The day alternatives are '3[01]|[12][0-9]|[1-9]'. The former
    # '[30|31]' was a character class matching ONE character, so a day
    # like "30" was parsed as "3". '[1|2]' and '[-|\/]' likewise accepted
    # a literal '|'.
    date_thin_reg = re.compile(
        r'(?P<year_part>[12]\d{3})[-/]'
        r'(?P<month_part>1[0-2]|[1-9])'
        r'[-/]?(?P<day_part>3[01]|[12][0-9]|[1-9])?'
    )
    result = date_thin_reg.search(sequence)
    if result is None:
        return ''
    unit_list = ['年', '月', '日']
    reg_list = []

    year_part = result.group('year_part')
    if year_part.startswith('1'):
        # 1xxx years are read digit by digit.
        year_part = re.sub(r'1[0-9]{3}', match_year_digit, year_part)
    else:
        year_part = match_common_number(year_part)
    reg_list.append(year_part)
    reg_list.append(unit_list[0])

    reg_list.append(match_common_number(result.group('month_part')))
    reg_list.append(unit_list[1])

    # The day is optional; skip it explicitly rather than relying on a
    # `return` inside `finally` to hide the resulting exception.
    day_part = result.group('day_part')
    if day_part is not None:
        reg_list.append(match_common_number(day_part))
        reg_list.append(unit_list[2])
    return ''.join(reg_list)
def first_number_filter(sequence="1989年是一个夏天,他59岁了,占有公司89%的股份, 后来死于1999-12-30 13:45:12~15:50,享年102岁", count=4):
    """Rewrite the Arabic numbers in ``sequence`` as Chinese text.

    The conversions run in a fixed order so that the more specific formats
    are handled before the generic digit replacement:

    1. Years of the form ``1xxx年`` are spelled digit by digit
       (e.g. "1989年" -> "一九八九年"). Other years are left to step 5.
    2. Percentages ``N%`` become "百分之" followed by the number.
    3. Times and time ranges (``13:45:12``, ``13:45:12~15:50``) are
       converted with ``time_thin_filter``; a range is joined with '到'.
    4. Dates (``1999-12-30``, ``1999/12/30``) are converted with
       ``date_thin_filter``.
    5. Every remaining digit run goes through ``match_common_number``
       (e.g. "59岁" -> "五十九岁").

    Each occurrence is converted from its own matched text, and a step
    that finds nothing leaves the string untouched.

    Args:
        sequence: The text to convert.
        count: Maximum number of ``1xxx年`` years replaced in step 1,
            passed to ``re.sub`` (0 means no limit).

    Returns:
        The converted string.
    """
    # (1) Years: only 1xxx years are read digit by digit.
    s = re.sub(r'1[0-9]{3}年', match_year_digit, sequence, count=count)

    # (2) Percentages. A callback is used so that each percentage gets its
    # own value instead of all of them receiving the first one's.
    percent_filter = re.compile(r'(?P<percent_part>\d+)(?P<percent_mark>%)')

    def _percent_to_chinese(m):
        return '百分之' + match_common_number(m.group('percent_part'))

    s = percent_filter.sub(_percent_to_chinese, s)

    # (3) Times: matches '12:09:00' as well as '12:09:00~12:31:30'.
    time_filter = re.compile(
        r'(?P<start_time_part>\d{1,2}(?::\d{1,2}){1,2})'
        r'(?:(?P<time_mark>[~-])'
        r'(?P<end_time_part>\d{1,2}(?::\d{1,2}){1,2}))?'
    )

    def _time_to_chinese(m):
        start_text = time_thin_filter(m.group('start_time_part'))
        if not start_text:
            # Conversion produced nothing: keep the original text.
            return m.group()
        end_raw = m.group('end_time_part')
        if not end_raw:
            # A single time is not a range, so no '到' is appended.
            return start_text
        end_text = time_thin_filter(end_raw)
        if not end_text:
            return m.group()
        return start_text + '到' + end_text

    s = time_filter.sub(_time_to_chinese, s)

    # (4) Dates. The day alternatives are '3[01]|[12][0-9]|[1-9]'; the
    # former '[30|31]' was a one-character class that cut "30" down to "3".
    date_filter = re.compile(
        r'(?P<year_part>[12]\d{3})[-/]'
        r'(?P<month_part>1[0-2]|[1-9])'
        r'[-/]?(?P<day_part>3[01]|[12][0-9]|[1-9])?'
    )

    def _date_to_chinese(m):
        # Fall back to the original text if the conversion yields nothing.
        return date_thin_filter(m.group()) or m.group()

    s = date_filter.sub(_date_to_chinese, s)

    # (last) All remaining numbers.
    return re.sub(r'\d+', match_common_number, s)
def match_common_number(match):
    recursive_depth=0
    number = match if(type(match) is type('')) else match.group()
    # 数字非年份的情况,返回转化成中文的结果
    str_number = str(number)
    if len(str_number) > 4:
        str_number = str_number[-4:]
    bits = "零 一 二 三 四 五 六 七 八 九".split(" ")
    units = " 十 百 千".split(" ")
    large_unit = ' 万 亿 万'.split(" ")  # 可扩展,以万为单位
    number_len = len(str_number)
    result = ""
    for i in range(number_len):
        result += bits[int(str_number[i])]
        if str_number[i] != "0":
            result += units[number_len - i - 1]
    # 去除连续的零
    while "零零" in result:
        result = result.replace("零零", "零")
    # 去除尾部的零
    if result[-1] == "零":
        result = result[:-1]
    # 调整10~20之间的数
    if result[:2] == "一十":
        result = result[1:]
    # 字符串连接上大单位
    result += large_unit[recursive_depth]
    # 判断是否递归
    if len(str(number)) > 4: