在处理一个语料数据集的过程中,遇到了这样的场景:需要把一段字符串中的阿拉伯数字转成中文,
但是年份的情况比较特殊:比如"1987年"要逐位读成"一九八七年",而一般的数字如"59岁"则按数值读成"五十九岁"。
我的思路是先利用正则处理带有年份的数字部分,然后再进行其他数字的处理,
其中要注意的一个细节是:re.sub 的第二个参数可以传入一个函数,方便针对每个匹配到的部分分别进行替换。re.sub 会把 match 对象作为参数传给这个函数,在函数内部通过 match.group() 取得当前匹配到的字符串,处理后返回替换文本。具体的代码如下:
def match_year_digit(match):
    """Spell a year digit by digit in Chinese numerals.

    Intended as the replacement callback for ``re.sub``: every ASCII digit
    of the matched text is mapped to its Chinese numeral ("1987年" ->
    "一九八七年").

    Args:
        match: An ``re.Match`` object (the usual ``re.sub`` callback
            argument) or a plain string, mirroring what
            ``match_common_number`` accepts.

    Returns:
        The text with each digit replaced; any other character (such as
        the trailing "年") is passed through unchanged.
    """
    text = match if isinstance(match, str) else str(match.group())
    relation = dict(zip('0123456789', '零一二三四五六七八九'))
    # Pass unknown characters through instead of raising KeyError, so the
    # callback stays safe if the calling pattern is ever widened.
    return ''.join(relation.get(ch, ch) for ch in text)
def time_thin_filter(sequence):
    """Convert the first clock time found in *sequence* to spoken Chinese.

    Recognises ``H:M`` and ``H:M:S`` (one or two digits per field, ASCII
    or full-width colon). Hours and minutes are followed by "时" and "分";
    seconds, when present, are appended without a unit, as in the original
    unit list ``['时', '分', '']``.

    Args:
        sequence: Text expected to contain a clock time, e.g. "13:45:12".

    Returns:
        The spoken form built with ``match_common_number``, or ``''`` when
        *sequence* is empty or contains no clock time.
    """
    if not sequence:
        return ''
    # The original class ``[:|:]`` also accepted a literal "|"; only real
    # colons are separators. Seconds are tied to their own separator so
    # "12:3045" is no longer read as 12:30:45.
    time_thin_reg = re.compile(
        r'(?P<hour_part>\d{1,2})[:\uff1a](?P<minute_part>\d{1,2})'
        r'(?:[:\uff1a](?P<second_part>\d{1,2}))?'
    )
    result = time_thin_reg.search(sequence)
    if result is None:
        return ''

    pieces = [
        match_common_number(result.group('hour_part')), '时',
        match_common_number(result.group('minute_part')), '分',
    ]
    second_part = result.group('second_part')
    # Explicit check replaces the old reliance on ``finally: return``
    # swallowing the error raised for a missing seconds field.
    if second_part is not None:
        pieces.append(match_common_number(second_part))
    return ''.join(pieces)
def date_thin_filter(sequence):
    """Convert the first ``Y-M[-D]`` / ``Y/M[/D]`` date in *sequence* to Chinese.

    Years starting with "1" are spelled digit by digit through
    ``match_year_digit``; other years go through ``match_common_number``,
    as the original code did. Month and day are read as ordinary numbers
    and followed by "月" / "日"; the day is optional.

    Args:
        sequence: Text expected to contain a date such as "1999-12-30".

    Returns:
        The spoken date, or ``''`` when *sequence* is empty or holds no date.
    """
    if not sequence:
        return ''
    # Fixes over the original pattern:
    #  * ``[30|31]`` was a character class, so day "30" matched only "3";
    #  * ``[1|2]`` and ``[-|\/]`` accepted a literal "|";
    #  * zero-padded months/days ("01") were not matched at all.
    date_reg = re.compile(
        r'(?<!\d)(?P<year_part>[12]\d{3})[-/]'
        r'(?P<month_part>1[0-2]|0?[1-9])'
        r'(?:[-/](?P<day_part>3[01]|[12]\d|0?[1-9]))?(?!\d)'
    )
    result = date_reg.search(sequence)
    if result is None:
        return ''

    year_part = result.group('year_part')
    if year_part.startswith('1'):
        # Routed through re.sub because match_year_digit is written as a
        # re.sub callback that receives a match object.
        year_part = re.sub(r'1[0-9]{3}', match_year_digit, year_part)
    else:
        year_part = match_common_number(year_part)

    # int() strips the zero padding so "01" is read as "一", not "零一".
    month_part = str(int(result.group('month_part')))
    pieces = [year_part, '年', match_common_number(month_part), '月']

    day_part = result.group('day_part')
    if day_part is not None:
        pieces.append(match_common_number(str(int(day_part))))
        pieces.append('日')
    return ''.join(pieces)
def first_number_filter(sequence="1989年是一个夏天,他59岁了,占有公司89%的股份, 后来死于1999-12-30 13:45:12~15:50,享年102岁", count=4):
    """Rewrite the Arabic numbers in *sequence* as spoken Chinese.

    Passes are applied in this order:
      1. "1xxx年" years are spelled digit by digit (at most *count* of them);
      2. "N%" becomes "百分之" + number;
      3. clock times and "start~end" / "start-end" ranges are converted
         with ``time_thin_filter``, joined by "到";
      4. ``Y-M[-D]`` / ``Y/M[/D]`` dates are converted with
         ``date_thin_filter``;
      5. every remaining digit run goes through ``match_common_number``.

    Each percent, time and date is converted from its own matched text.
    The original converted only the first occurrence and pasted that
    result over all of them, and it crashed when the text contained no
    percent, time or date at all.

    Args:
        sequence: Text to convert.
        count: Maximum number of "1xxx年" years replaced in pass 1
            (``0`` means no limit, as in ``re.sub``).

    Returns:
        The converted text. For the default argument the intended result
        (per the original author's note, with the day read as 三十) is
        roughly "一九八九年是一个夏天,他五十九岁了,占有公司百分之八十九的股份,
        后来死于一九九九年十二月三十日 十三时四十五分十二到十五时五十分,享年一百零二岁"
        -- exact wording depends on ``match_common_number``.
    """
    # (1) Years of the previous century written with a trailing "年".
    s = re.sub(r'1[0-9]{3}年', match_year_digit, sequence, count=count)

    # (2) Percentages.
    def _spell_percent(m):
        return '百分之' + match_common_number(m.group('percent_part'))

    s = re.sub(r'(?P<percent_part>\d+)(?P<percent_mark>%)', _spell_percent, s)

    # (3) Clock times such as '12:09:00' or '12:09:00~12:31:30'. The range
    # mark is consumed only together with an end time, so a lone time no
    # longer gains a dangling "到" and a trailing "-" or "~" is left alone.
    clock = r'\d{1,2}(?:[:\uff1a]\d{1,2}){1,2}'
    time_filter = re.compile(
        r'(?P<start_time_part>' + clock + r')'
        r'(?:(?P<time_mark>[~-])(?P<end_time_part>' + clock + r'))?'
    )

    def _spell_time(m):
        start = m.group('start_time_part')
        # Fall back to the raw text if the helper cannot convert it, so
        # nothing is deleted; the final digit pass still handles it.
        spoken = time_thin_filter(start) or start
        end = m.group('end_time_part')
        if end:
            spoken += '到' + (time_thin_filter(end) or end)
        return spoken

    s = time_filter.sub(_spell_time, s)

    # (4) Dates. ``[30|31]`` in the original was a character class, which
    # turned day "30" into "3"; zero-padded fields are accepted as well.
    date_filter = re.compile(
        r'(?<!\d)(?P<year_part>[12]\d{3})[-/]'
        r'(?P<month_part>1[0-2]|0?[1-9])'
        r'(?:[-/](?P<day_part>3[01]|[12]\d|0?[1-9]))?(?!\d)'
    )

    def _spell_date(m):
        return date_thin_filter(m.group()) or m.group()

    s = date_filter.sub(_spell_date, s)

    # (last) Any digits still left are ordinary numbers.
    return re.sub(r'\d+', match_common_number, s)
def match_common_number(match):
recursive_depth=0
number = match if(type(match) is type('')) else match.group()
# 数字非年份的情况,返回转化成中文的结果
str_number = str(number)
if len(str_number) > 4:
str_number = str_number[-4:]
bits = "零 一 二 三 四 五 六 七 八 九".split(" ")
units = " 十 百 千".split(" ")
large_unit = ' 万 亿 万'.split(" ") # 可扩展,以万为单位
number_len = len(str_number)
result = ""
for i in range(number_len):
result += bits[int(str_number[i])]
if str_number[i] != "0":
result += units[number_len - i - 1]
# 去除连续的零
while "零零" in result:
result = result.replace("零零", "零")
# 去除尾部的零
if result[-1] == "零":
result = result[:-1]
# 调整10~20之间的数
if result[:2] == "一十":
result = result[1:]
# 字符串连接上大单位
result += large_unit[recursive_depth]
# 判断是否递归
if len(str(number)) > 4: