textnorm_zh.py 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834
  1. #!/usr/bin/env python3
  2. # coding=utf-8
  3. # Authors:
  4. # 2019.5 Zhiyang Zhou (https://github.com/Joee1995/chn_text_norm.git)
  5. # 2019.9 Jiayu DU
  6. #
  7. # requirements:
  8. # - python 3.X
  9. # notes: python 2.X WILL fail or produce misleading results
  10. import sys, os, argparse, codecs, string, re
  11. # ================================================================================ #
  12. # basic constant
  13. # ================================================================================ #
  14. CHINESE_DIGIS = u'零一二三四五六七八九'
  15. BIG_CHINESE_DIGIS_SIMPLIFIED = u'零壹贰叁肆伍陆柒捌玖'
  16. BIG_CHINESE_DIGIS_TRADITIONAL = u'零壹貳參肆伍陸柒捌玖'
  17. SMALLER_BIG_CHINESE_UNITS_SIMPLIFIED = u'十百千万'
  18. SMALLER_BIG_CHINESE_UNITS_TRADITIONAL = u'拾佰仟萬'
  19. LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'亿兆京垓秭穰沟涧正载'
  20. LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'億兆京垓秭穰溝澗正載'
  21. SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'十百千万'
  22. SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'拾佰仟萬'
  23. ZERO_ALT = u'〇'
  24. ONE_ALT = u'幺'
  25. TWO_ALTS = [u'两', u'兩']
  26. POSITIVE = [u'正', u'正']
  27. NEGATIVE = [u'负', u'負']
  28. POINT = [u'点', u'點']
  29. # PLUS = [u'加', u'加']
  30. # SIL = [u'杠', u'槓']
  31. FILLER_CHARS = ['呃', '啊']
  32. ER_WHITELIST = '(儿女|儿子|儿孙|女儿|儿媳|妻儿|' \
  33. '胎儿|婴儿|新生儿|婴幼儿|幼儿|少儿|小儿|儿歌|儿童|儿科|托儿所|孤儿|' \
  34. '儿戏|儿化|台儿庄|鹿儿岛|正儿八经|吊儿郎当|生儿育女|托儿带女|养儿防老|痴儿呆女|' \
  35. '佳儿佳妇|儿怜兽扰|儿无常父|儿不嫌母丑|儿行千里母担忧|儿大不由爷|苏乞儿)'
  36. # 中文数字系统类型
  37. NUMBERING_TYPES = ['low', 'mid', 'high']
  38. CURRENCY_NAMES = '(人民币|美元|日元|英镑|欧元|马克|法郎|加拿大元|澳元|港币|先令|芬兰马克|爱尔兰镑|' \
  39. '里拉|荷兰盾|埃斯库多|比塞塔|印尼盾|林吉特|新西兰元|比索|卢布|新加坡元|韩元|泰铢)'
  40. CURRENCY_UNITS = '((亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)'
  41. COM_QUANTIFIERS = '(匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|' \
  42. '砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|' \
  43. '针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|' \
  44. '毫|厘|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|' \
  45. '盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|' \
  46. '纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块)'
  47. # punctuation information are based on Zhon project (https://github.com/tsroten/zhon.git)
  48. CHINESE_PUNC_STOP = '!?。。'
  49. CHINESE_PUNC_NON_STOP = '"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏'
  50. CHINESE_PUNC_LIST = CHINESE_PUNC_STOP + CHINESE_PUNC_NON_STOP
  51. # ================================================================================ #
  52. # basic class
  53. # ================================================================================ #
  54. class ChineseChar(object):
  55. """
  56. 中文字符
  57. 每个字符对应简体和繁体,
  58. e.g. 简体 = '负', 繁体 = '負'
  59. 转换时可转换为简体或繁体
  60. """
  61. def __init__(self, simplified, traditional):
  62. self.simplified = simplified
  63. self.traditional = traditional
  64. #self.__repr__ = self.__str__
  65. def __str__(self):
  66. return self.simplified or self.traditional or None
  67. def __repr__(self):
  68. return self.__str__()
  69. class ChineseNumberUnit(ChineseChar):
  70. """
  71. 中文数字/数位字符
  72. 每个字符除繁简体外还有一个额外的大写字符
  73. e.g. '陆' 和 '陸'
  74. """
  75. def __init__(self, power, simplified, traditional, big_s, big_t):
  76. super(ChineseNumberUnit, self).__init__(simplified, traditional)
  77. self.power = power
  78. self.big_s = big_s
  79. self.big_t = big_t
  80. def __str__(self):
  81. return '10^{}'.format(self.power)
  82. @classmethod
  83. def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=False):
  84. if small_unit:
  85. return ChineseNumberUnit(power=index + 1,
  86. simplified=value[0], traditional=value[1], big_s=value[1], big_t=value[1])
  87. elif numbering_type == NUMBERING_TYPES[0]:
  88. return ChineseNumberUnit(power=index + 8,
  89. simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1])
  90. elif numbering_type == NUMBERING_TYPES[1]:
  91. return ChineseNumberUnit(power=(index + 2) * 4,
  92. simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1])
  93. elif numbering_type == NUMBERING_TYPES[2]:
  94. return ChineseNumberUnit(power=pow(2, index + 3),
  95. simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1])
  96. else:
  97. raise ValueError(
  98. 'Counting type should be in {0} ({1} provided).'.format(NUMBERING_TYPES, numbering_type))
  99. class ChineseNumberDigit(ChineseChar):
  100. """
  101. 中文数字字符
  102. """
  103. def __init__(self, value, simplified, traditional, big_s, big_t, alt_s=None, alt_t=None):
  104. super(ChineseNumberDigit, self).__init__(simplified, traditional)
  105. self.value = value
  106. self.big_s = big_s
  107. self.big_t = big_t
  108. self.alt_s = alt_s
  109. self.alt_t = alt_t
  110. def __str__(self):
  111. return str(self.value)
  112. @classmethod
  113. def create(cls, i, v):
  114. return ChineseNumberDigit(i, v[0], v[1], v[2], v[3])
  115. class ChineseMath(ChineseChar):
  116. """
  117. 中文数位字符
  118. """
  119. def __init__(self, simplified, traditional, symbol, expression=None):
  120. super(ChineseMath, self).__init__(simplified, traditional)
  121. self.symbol = symbol
  122. self.expression = expression
  123. self.big_s = simplified
  124. self.big_t = traditional
  125. CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, ChineseNumberDigit, ChineseMath
  126. class NumberSystem(object):
  127. """
  128. 中文数字系统
  129. """
  130. pass
  131. class MathSymbol(object):
  132. """
  133. 用于中文数字系统的数学符号 (繁/简体), e.g.
  134. positive = ['正', '正']
  135. negative = ['负', '負']
  136. point = ['点', '點']
  137. """
  138. def __init__(self, positive, negative, point):
  139. self.positive = positive
  140. self.negative = negative
  141. self.point = point
  142. def __iter__(self):
  143. for v in self.__dict__.values():
  144. yield v
  145. # class OtherSymbol(object):
  146. # """
  147. # 其他符号
  148. # """
  149. #
  150. # def __init__(self, sil):
  151. # self.sil = sil
  152. #
  153. # def __iter__(self):
  154. # for v in self.__dict__.values():
  155. # yield v
  156. # ================================================================================ #
  157. # basic utils
  158. # ================================================================================ #
  159. def create_system(numbering_type=NUMBERING_TYPES[1]):
  160. """
  161. 根据数字系统类型返回创建相应的数字系统,默认为 mid
  162. NUMBERING_TYPES = ['low', 'mid', 'high']: 中文数字系统类型
  163. low: '兆' = '亿' * '十' = $10^{9}$, '京' = '兆' * '十', etc.
  164. mid: '兆' = '亿' * '万' = $10^{12}$, '京' = '兆' * '万', etc.
  165. high: '兆' = '亿' * '亿' = $10^{16}$, '京' = '兆' * '兆', etc.
  166. 返回对应的数字系统
  167. """
  168. # chinese number units of '亿' and larger
  169. all_larger_units = zip(
  170. LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED, LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL)
  171. larger_units = [CNU.create(i, v, numbering_type, False)
  172. for i, v in enumerate(all_larger_units)]
  173. # chinese number units of '十, 百, 千, 万'
  174. all_smaller_units = zip(
  175. SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED, SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL)
  176. smaller_units = [CNU.create(i, v, small_unit=True)
  177. for i, v in enumerate(all_smaller_units)]
  178. # digis
  179. chinese_digis = zip(CHINESE_DIGIS, CHINESE_DIGIS,
  180. BIG_CHINESE_DIGIS_SIMPLIFIED, BIG_CHINESE_DIGIS_TRADITIONAL)
  181. digits = [CND.create(i, v) for i, v in enumerate(chinese_digis)]
  182. digits[0].alt_s, digits[0].alt_t = ZERO_ALT, ZERO_ALT
  183. digits[1].alt_s, digits[1].alt_t = ONE_ALT, ONE_ALT
  184. digits[2].alt_s, digits[2].alt_t = TWO_ALTS[0], TWO_ALTS[1]
  185. # symbols
  186. positive_cn = CM(POSITIVE[0], POSITIVE[1], '+', lambda x: x)
  187. negative_cn = CM(NEGATIVE[0], NEGATIVE[1], '-', lambda x: -x)
  188. point_cn = CM(POINT[0], POINT[1], '.', lambda x,
  189. y: float(str(x) + '.' + str(y)))
  190. # sil_cn = CM(SIL[0], SIL[1], '-', lambda x, y: float(str(x) + '-' + str(y)))
  191. system = NumberSystem()
  192. system.units = smaller_units + larger_units
  193. system.digits = digits
  194. system.math = MathSymbol(positive_cn, negative_cn, point_cn)
  195. # system.symbols = OtherSymbol(sil_cn)
  196. return system
  197. def chn2num(chinese_string, numbering_type=NUMBERING_TYPES[1]):
  198. def get_symbol(char, system):
  199. for u in system.units:
  200. if char in [u.traditional, u.simplified, u.big_s, u.big_t]:
  201. return u
  202. for d in system.digits:
  203. if char in [d.traditional, d.simplified, d.big_s, d.big_t, d.alt_s, d.alt_t]:
  204. return d
  205. for m in system.math:
  206. if char in [m.traditional, m.simplified]:
  207. return m
  208. def string2symbols(chinese_string, system):
  209. int_string, dec_string = chinese_string, ''
  210. for p in [system.math.point.simplified, system.math.point.traditional]:
  211. if p in chinese_string:
  212. int_string, dec_string = chinese_string.split(p)
  213. break
  214. return [get_symbol(c, system) for c in int_string], \
  215. [get_symbol(c, system) for c in dec_string]
  216. def correct_symbols(integer_symbols, system):
  217. """
  218. 一百八 to 一百八十
  219. 一亿一千三百万 to 一亿 一千万 三百万
  220. """
  221. if integer_symbols and isinstance(integer_symbols[0], CNU):
  222. if integer_symbols[0].power == 1:
  223. integer_symbols = [system.digits[1]] + integer_symbols
  224. if len(integer_symbols) > 1:
  225. if isinstance(integer_symbols[-1], CND) and isinstance(integer_symbols[-2], CNU):
  226. integer_symbols.append(
  227. CNU(integer_symbols[-2].power - 1, None, None, None, None))
  228. result = []
  229. unit_count = 0
  230. for s in integer_symbols:
  231. if isinstance(s, CND):
  232. result.append(s)
  233. unit_count = 0
  234. elif isinstance(s, CNU):
  235. current_unit = CNU(s.power, None, None, None, None)
  236. unit_count += 1
  237. if unit_count == 1:
  238. result.append(current_unit)
  239. elif unit_count > 1:
  240. for i in range(len(result)):
  241. if isinstance(result[-i - 1], CNU) and result[-i - 1].power < current_unit.power:
  242. result[-i - 1] = CNU(result[-i - 1].power +
  243. current_unit.power, None, None, None, None)
  244. return result
  245. def compute_value(integer_symbols):
  246. """
  247. Compute the value.
  248. When current unit is larger than previous unit, current unit * all previous units will be used as all previous units.
  249. e.g. '两千万' = 2000 * 10000 not 2000 + 10000
  250. """
  251. value = [0]
  252. last_power = 0
  253. for s in integer_symbols:
  254. if isinstance(s, CND):
  255. value[-1] = s.value
  256. elif isinstance(s, CNU):
  257. value[-1] *= pow(10, s.power)
  258. if s.power > last_power:
  259. value[:-1] = list(map(lambda v: v *
  260. pow(10, s.power), value[:-1]))
  261. last_power = s.power
  262. value.append(0)
  263. return sum(value)
  264. system = create_system(numbering_type)
  265. int_part, dec_part = string2symbols(chinese_string, system)
  266. int_part = correct_symbols(int_part, system)
  267. int_str = str(compute_value(int_part))
  268. dec_str = ''.join([str(d.value) for d in dec_part])
  269. if dec_part:
  270. return '{0}.{1}'.format(int_str, dec_str)
  271. else:
  272. return int_str
  273. def num2chn(number_string, numbering_type=NUMBERING_TYPES[1], big=False,
  274. traditional=False, alt_zero=False, alt_one=False, alt_two=True,
  275. use_zeros=True, use_units=True):
  276. def get_value(value_string, use_zeros=True):
  277. striped_string = value_string.lstrip('0')
  278. # record nothing if all zeros
  279. if not striped_string:
  280. return []
  281. # record one digits
  282. elif len(striped_string) == 1:
  283. if use_zeros and len(value_string) != len(striped_string):
  284. return [system.digits[0], system.digits[int(striped_string)]]
  285. else:
  286. return [system.digits[int(striped_string)]]
  287. # recursively record multiple digits
  288. else:
  289. result_unit = next(u for u in reversed(
  290. system.units) if u.power < len(striped_string))
  291. result_string = value_string[:-result_unit.power]
  292. return get_value(result_string) + [result_unit] + get_value(striped_string[-result_unit.power:])
  293. system = create_system(numbering_type)
  294. int_dec = number_string.split('.')
  295. if len(int_dec) == 1:
  296. int_string = int_dec[0]
  297. dec_string = ""
  298. elif len(int_dec) == 2:
  299. int_string = int_dec[0]
  300. dec_string = int_dec[1]
  301. else:
  302. raise ValueError(
  303. "invalid input num string with more than one dot: {}".format(number_string))
  304. if use_units and len(int_string) > 1:
  305. result_symbols = get_value(int_string)
  306. else:
  307. result_symbols = [system.digits[int(c)] for c in int_string]
  308. dec_symbols = [system.digits[int(c)] for c in dec_string]
  309. if dec_string:
  310. result_symbols += [system.math.point] + dec_symbols
  311. if alt_two:
  312. liang = CND(2, system.digits[2].alt_s, system.digits[2].alt_t,
  313. system.digits[2].big_s, system.digits[2].big_t)
  314. for i, v in enumerate(result_symbols):
  315. if isinstance(v, CND) and v.value == 2:
  316. next_symbol = result_symbols[i +
  317. 1] if i < len(result_symbols) - 1 else None
  318. previous_symbol = result_symbols[i - 1] if i > 0 else None
  319. if isinstance(next_symbol, CNU) and isinstance(previous_symbol, (CNU, type(None))):
  320. if next_symbol.power != 1 and ((previous_symbol is None) or (previous_symbol.power != 1)):
  321. result_symbols[i] = liang
  322. # if big is True, '两' will not be used and `alt_two` has no impact on output
  323. if big:
  324. attr_name = 'big_'
  325. if traditional:
  326. attr_name += 't'
  327. else:
  328. attr_name += 's'
  329. else:
  330. if traditional:
  331. attr_name = 'traditional'
  332. else:
  333. attr_name = 'simplified'
  334. result = ''.join([getattr(s, attr_name) for s in result_symbols])
  335. # if not use_zeros:
  336. # result = result.strip(getattr(system.digits[0], attr_name))
  337. if alt_zero:
  338. result = result.replace(
  339. getattr(system.digits[0], attr_name), system.digits[0].alt_s)
  340. if alt_one:
  341. result = result.replace(
  342. getattr(system.digits[1], attr_name), system.digits[1].alt_s)
  343. for i, p in enumerate(POINT):
  344. if result.startswith(p):
  345. return CHINESE_DIGIS[0] + result
  346. # ^10, 11, .., 19
  347. if len(result) >= 2 and result[1] in [SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED[0],
  348. SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL[0]] and \
  349. result[0] in [CHINESE_DIGIS[1], BIG_CHINESE_DIGIS_SIMPLIFIED[1], BIG_CHINESE_DIGIS_TRADITIONAL[1]]:
  350. result = result[1:]
  351. return result
  352. # ================================================================================ #
  353. # different types of rewriters
  354. # ================================================================================ #
  355. class Cardinal:
  356. """
  357. CARDINAL类
  358. """
  359. def __init__(self, cardinal=None, chntext=None):
  360. self.cardinal = cardinal
  361. self.chntext = chntext
  362. def chntext2cardinal(self):
  363. return chn2num(self.chntext)
  364. def cardinal2chntext(self):
  365. return num2chn(self.cardinal)
  366. class Digit:
  367. """
  368. DIGIT类
  369. """
  370. def __init__(self, digit=None, chntext=None):
  371. self.digit = digit
  372. self.chntext = chntext
  373. # def chntext2digit(self):
  374. # return chn2num(self.chntext)
  375. def digit2chntext(self):
  376. return num2chn(self.digit, alt_two=False, use_units=False)
  377. class TelePhone:
  378. """
  379. TELEPHONE类
  380. """
  381. def __init__(self, telephone=None, raw_chntext=None, chntext=None):
  382. self.telephone = telephone
  383. self.raw_chntext = raw_chntext
  384. self.chntext = chntext
  385. # def chntext2telephone(self):
  386. # sil_parts = self.raw_chntext.split('<SIL>')
  387. # self.telephone = '-'.join([
  388. # str(chn2num(p)) for p in sil_parts
  389. # ])
  390. # return self.telephone
  391. def telephone2chntext(self, fixed=False):
  392. if fixed:
  393. sil_parts = self.telephone.split('-')
  394. self.raw_chntext = '<SIL>'.join([
  395. num2chn(part, alt_two=False, use_units=False) for part in sil_parts
  396. ])
  397. self.chntext = self.raw_chntext.replace('<SIL>', '')
  398. else:
  399. sp_parts = self.telephone.strip('+').split()
  400. self.raw_chntext = '<SP>'.join([
  401. num2chn(part, alt_two=False, use_units=False) for part in sp_parts
  402. ])
  403. self.chntext = self.raw_chntext.replace('<SP>', '')
  404. return self.chntext
  405. class Fraction:
  406. """
  407. FRACTION类
  408. """
  409. def __init__(self, fraction=None, chntext=None):
  410. self.fraction = fraction
  411. self.chntext = chntext
  412. def chntext2fraction(self):
  413. denominator, numerator = self.chntext.split('分之')
  414. return chn2num(numerator) + '/' + chn2num(denominator)
  415. def fraction2chntext(self):
  416. numerator, denominator = self.fraction.split('/')
  417. return num2chn(denominator) + '分之' + num2chn(numerator)
  418. class Date:
  419. """
  420. DATE类
  421. """
  422. def __init__(self, date=None, chntext=None):
  423. self.date = date
  424. self.chntext = chntext
  425. # def chntext2date(self):
  426. # chntext = self.chntext
  427. # try:
  428. # year, other = chntext.strip().split('年', maxsplit=1)
  429. # year = Digit(chntext=year).digit2chntext() + '年'
  430. # except ValueError:
  431. # other = chntext
  432. # year = ''
  433. # if other:
  434. # try:
  435. # month, day = other.strip().split('月', maxsplit=1)
  436. # month = Cardinal(chntext=month).chntext2cardinal() + '月'
  437. # except ValueError:
  438. # day = chntext
  439. # month = ''
  440. # if day:
  441. # day = Cardinal(chntext=day[:-1]).chntext2cardinal() + day[-1]
  442. # else:
  443. # month = ''
  444. # day = ''
  445. # date = year + month + day
  446. # self.date = date
  447. # return self.date
  448. def date2chntext(self):
  449. date = self.date
  450. try:
  451. year, other = date.strip().split('年', 1)
  452. year = Digit(digit=year).digit2chntext() + '年'
  453. except ValueError:
  454. other = date
  455. year = ''
  456. if other:
  457. try:
  458. month, day = other.strip().split('月', 1)
  459. month = Cardinal(cardinal=month).cardinal2chntext() + '月'
  460. except ValueError:
  461. day = date
  462. month = ''
  463. if day:
  464. day = Cardinal(cardinal=day[:-1]).cardinal2chntext() + day[-1]
  465. else:
  466. month = ''
  467. day = ''
  468. chntext = year + month + day
  469. self.chntext = chntext
  470. return self.chntext
  471. class Money:
  472. """
  473. MONEY类
  474. """
  475. def __init__(self, money=None, chntext=None):
  476. self.money = money
  477. self.chntext = chntext
  478. # def chntext2money(self):
  479. # return self.money
  480. def money2chntext(self):
  481. money = self.money
  482. pattern = re.compile(r'(\d+(\.\d+)?)')
  483. matchers = pattern.findall(money)
  484. if matchers:
  485. for matcher in matchers:
  486. money = money.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext())
  487. self.chntext = money
  488. return self.chntext
  489. class Percentage:
  490. """
  491. PERCENTAGE类
  492. """
  493. def __init__(self, percentage=None, chntext=None):
  494. self.percentage = percentage
  495. self.chntext = chntext
  496. def chntext2percentage(self):
  497. return chn2num(self.chntext.strip().strip('百分之')) + '%'
  498. def percentage2chntext(self):
  499. return '百分之' + num2chn(self.percentage.strip().strip('%'))
  500. def remove_erhua(text, er_whitelist):
  501. """
  502. 去除儿化音词中的儿:
  503. 他女儿在那边儿 -> 他女儿在那边
  504. """
  505. er_pattern = re.compile(er_whitelist)
  506. new_str=''
  507. while re.search('儿',text):
  508. a = re.search('儿',text).span()
  509. remove_er_flag = 0
  510. if er_pattern.search(text):
  511. b = er_pattern.search(text).span()
  512. if b[0] <= a[0]:
  513. remove_er_flag = 1
  514. if remove_er_flag == 0 :
  515. new_str = new_str + text[0:a[0]]
  516. text = text[a[1]:]
  517. else:
  518. new_str = new_str + text[0:b[1]]
  519. text = text[b[1]:]
  520. text = new_str + text
  521. return text
  522. # ================================================================================ #
  523. # NSW Normalizer
  524. # ================================================================================ #
  525. class NSWNormalizer:
  526. def __init__(self, raw_text):
  527. self.raw_text = '^' + raw_text + '$'
  528. self.norm_text = ''
  529. def _particular(self):
  530. text = self.norm_text
  531. pattern = re.compile(r"(([a-zA-Z]+)二([a-zA-Z]+))")
  532. matchers = pattern.findall(text)
  533. if matchers:
  534. # print('particular')
  535. for matcher in matchers:
  536. text = text.replace(matcher[0], matcher[1]+'2'+matcher[2], 1)
  537. self.norm_text = text
  538. return self.norm_text
  539. def normalize(self):
  540. text = self.raw_text
  541. # 规范化日期
  542. pattern = re.compile(r"\D+((([089]\d|(19|20)\d{2})年)?(\d{1,2}月(\d{1,2}[日号])?)?)")
  543. matchers = pattern.findall(text)
  544. if matchers:
  545. #print('date')
  546. for matcher in matchers:
  547. text = text.replace(matcher[0], Date(date=matcher[0]).date2chntext(), 1)
  548. # 规范化金钱
  549. pattern = re.compile(r"\D+((\d+(\.\d+)?)[多余几]?" + CURRENCY_UNITS + r"(\d" + CURRENCY_UNITS + r"?)?)")
  550. matchers = pattern.findall(text)
  551. if matchers:
  552. #print('money')
  553. for matcher in matchers:
  554. text = text.replace(matcher[0], Money(money=matcher[0]).money2chntext(), 1)
  555. # 规范化固话/手机号码
  556. # 手机
  557. # http://www.jihaoba.com/news/show/13680
  558. # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198
  559. # 联通:130、131、132、156、155、186、185、176
  560. # 电信:133、153、189、180、181、177
  561. pattern = re.compile(r"\D((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})\D")
  562. matchers = pattern.findall(text)
  563. if matchers:
  564. #print('telephone')
  565. for matcher in matchers:
  566. text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(), 1)
  567. # 固话
  568. pattern = re.compile(r"\D((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})\D")
  569. matchers = pattern.findall(text)
  570. if matchers:
  571. # print('fixed telephone')
  572. for matcher in matchers:
  573. text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(fixed=True), 1)
  574. # 规范化分数
  575. pattern = re.compile(r"(\d+/\d+)")
  576. matchers = pattern.findall(text)
  577. if matchers:
  578. #print('fraction')
  579. for matcher in matchers:
  580. text = text.replace(matcher, Fraction(fraction=matcher).fraction2chntext(), 1)
  581. # 规范化百分数
  582. text = text.replace('%', '%')
  583. pattern = re.compile(r"(\d+(\.\d+)?%)")
  584. matchers = pattern.findall(text)
  585. if matchers:
  586. #print('percentage')
  587. for matcher in matchers:
  588. text = text.replace(matcher[0], Percentage(percentage=matcher[0]).percentage2chntext(), 1)
  589. # 规范化纯数+量词
  590. pattern = re.compile(r"(\d+(\.\d+)?)[多余几]?" + COM_QUANTIFIERS)
  591. matchers = pattern.findall(text)
  592. if matchers:
  593. #print('cardinal+quantifier')
  594. for matcher in matchers:
  595. text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1)
  596. # 规范化数字编号
  597. pattern = re.compile(r"(\d{4,32})")
  598. matchers = pattern.findall(text)
  599. if matchers:
  600. #print('digit')
  601. for matcher in matchers:
  602. text = text.replace(matcher, Digit(digit=matcher).digit2chntext(), 1)
  603. # 规范化纯数
  604. pattern = re.compile(r"(\d+(\.\d+)?)")
  605. matchers = pattern.findall(text)
  606. if matchers:
  607. #print('cardinal')
  608. for matcher in matchers:
  609. text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1)
  610. self.norm_text = text
  611. self._particular()
  612. return self.norm_text.lstrip('^').rstrip('$')
  613. def nsw_test_case(raw_text):
  614. print('I:' + raw_text)
  615. print('O:' + NSWNormalizer(raw_text).normalize())
  616. print('')
  617. def nsw_test():
  618. nsw_test_case('固话:0595-23865596或23880880。')
  619. nsw_test_case('固话:0595-23865596或23880880。')
  620. nsw_test_case('手机:+86 19859213959或15659451527。')
  621. nsw_test_case('分数:32477/76391。')
  622. nsw_test_case('百分数:80.03%。')
  623. nsw_test_case('编号:31520181154418。')
  624. nsw_test_case('纯数:2983.07克或12345.60米。')
  625. nsw_test_case('日期:1999年2月20日或09年3月15号。')
  626. nsw_test_case('金钱:12块5,34.5元,20.1万')
  627. nsw_test_case('特殊:O2O或B2C。')
  628. nsw_test_case('3456万吨')
  629. nsw_test_case('2938个')
  630. nsw_test_case('938')
  631. nsw_test_case('今天吃了115个小笼包231个馒头')
  632. nsw_test_case('有62%的概率')
  633. if __name__ == '__main__':
  634. #nsw_test()
  635. p = argparse.ArgumentParser()
  636. p.add_argument('ifile', help='input filename, assume utf-8 encoding')
  637. p.add_argument('ofile', help='output filename')
  638. p.add_argument('--to_upper', action='store_true', help='convert to upper case')
  639. p.add_argument('--to_lower', action='store_true', help='convert to lower case')
  640. p.add_argument('--has_key', action='store_true', help="input text has Kaldi's key as first field.")
  641. p.add_argument('--remove_fillers', type=bool, default=True, help='remove filler chars such as "呃, 啊"')
  642. p.add_argument('--remove_erhua', type=bool, default=True, help='remove erhua chars such as "这儿"')
  643. p.add_argument('--log_interval', type=int, default=10000, help='log interval in number of processed lines')
  644. args = p.parse_args()
  645. ifile = codecs.open(args.ifile, 'r', 'utf8')
  646. ofile = codecs.open(args.ofile, 'w+', 'utf8')
  647. n = 0
  648. for l in ifile:
  649. key = ''
  650. text = ''
  651. if args.has_key:
  652. cols = l.split(maxsplit=1)
  653. key = cols[0]
  654. if len(cols) == 2:
  655. text = cols[1].strip()
  656. else:
  657. text = ''
  658. else:
  659. text = l.strip()
  660. # cases
  661. if args.to_upper and args.to_lower:
  662. sys.stderr.write('text norm: to_upper OR to_lower?')
  663. exit(1)
  664. if args.to_upper:
  665. text = text.upper()
  666. if args.to_lower:
  667. text = text.lower()
  668. # Filler chars removal
  669. if args.remove_fillers:
  670. for ch in FILLER_CHARS:
  671. text = text.replace(ch, '')
  672. if args.remove_erhua:
  673. text = remove_erhua(text, ER_WHITELIST)
  674. # NSW(Non-Standard-Word) normalization
  675. text = NSWNormalizer(text).normalize()
  676. # Punctuations removal
  677. old_chars = CHINESE_PUNC_LIST + string.punctuation # includes all CN and EN punctuations
  678. new_chars = ' ' * len(old_chars)
  679. del_chars = ''
  680. text = text.translate(str.maketrans(old_chars, new_chars, del_chars))
  681. #
  682. if args.has_key:
  683. ofile.write(key + '\t' + text + '\n')
  684. else:
  685. ofile.write(text + '\n')
  686. n += 1
  687. if n % args.log_interval == 0:
  688. sys.stderr.write("text norm: {} lines done.\n".format(n))
  689. sys.stderr.write("text norm: {} lines done in total.\n".format(n))
  690. ifile.close()
  691. ofile.close()