我需要转换one
成1
,two
转入2
等等.
有没有办法用库或类或任何东西来做到这一点?
这段代码的大部分是设置numwords dict,它只在第一次调用时完成.
def text2int(textnum, numwords={}): if not numwords: units = [ "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen", ] tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"] scales = ["hundred", "thousand", "million", "billion", "trillion"] numwords["and"] = (1, 0) for idx, word in enumerate(units): numwords[word] = (1, idx) for idx, word in enumerate(tens): numwords[word] = (1, idx * 10) for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0) current = result = 0 for word in textnum.split(): if word not in numwords: raise Exception("Illegal word: " + word) scale, increment = numwords[word] current = current * scale + increment if scale > 100: result += current current = 0 return result + current print text2int("seven billion one hundred million thirty one thousand three hundred thirty seven") #7100031337
如果有人有兴趣,我会修改一个维护字符串其余部分的版本(虽然它可能有bug,但没有测试过多).
def text2int (textnum, numwords={}): if not numwords: units = [ "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen", ] tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"] scales = ["hundred", "thousand", "million", "billion", "trillion"] numwords["and"] = (1, 0) for idx, word in enumerate(units): numwords[word] = (1, idx) for idx, word in enumerate(tens): numwords[word] = (1, idx * 10) for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0) ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12} ordinal_endings = [('ieth', 'y'), ('th', '')] textnum = textnum.replace('-', ' ') current = result = 0 curstring = "" onnumber = False for word in textnum.split(): if word in ordinal_words: scale, increment = (1, ordinal_words[word]) current = current * scale + increment if scale > 100: result += current current = 0 onnumber = True else: for ending, replacement in ordinal_endings: if word.endswith(ending): word = "%s%s" % (word[:-len(ending)], replacement) if word not in numwords: if onnumber: curstring += repr(result + current) + " " curstring += word + " " result = current = 0 onnumber = False else: scale, increment = numwords[word] current = current * scale + increment if scale > 100: result += current current = 0 onnumber = True if onnumber: curstring += repr(result + current) return curstring
例:
>>> text2int("I want fifty five hot dogs for two hundred dollars.") I want 55 hot dogs for 200 dollars.
如果您有"200美元",可能会出现问题.但是,这真的非常粗糙.
我刚刚向PyPI发布了一个名为word2number的python模块,用于确切的目的.https://github.com/akshaynagpal/w2n
安装使用:
pip install word2number
确保您的点子更新到最新版本.
用法:
from word2number import w2n print w2n.word_to_num("two million three thousand nine hundred and eighty four") 2003984
感谢代码片段...给我节省了很多时间!
我需要处理一些额外的解析案例,例如序数词("第一","第二"),带连字符的词("一百")和带连字符的序数词("第五十七"),所以我添加了几行:
def text2int(textnum, numwords={}): if not numwords: units = [ "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen", ] tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"] scales = ["hundred", "thousand", "million", "billion", "trillion"] numwords["and"] = (1, 0) for idx, word in enumerate(units): numwords[word] = (1, idx) for idx, word in enumerate(tens): numwords[word] = (1, idx * 10) for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0) ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12} ordinal_endings = [('ieth', 'y'), ('th', '')] textnum = textnum.replace('-', ' ') current = result = 0 for word in textnum.split(): if word in ordinal_words: scale, increment = (1, ordinal_words[word]) else: for ending, replacement in ordinal_endings: if word.endswith(ending): word = "%s%s" % (word[:-len(ending)], replacement) if word not in numwords: raise Exception("Illegal word: " + word) scale, increment = numwords[word] current = current * scale + increment if scale > 100: result += current current = 0 return result + current`
我需要一些不同的东西,因为我的输入是从语音到文本的转换,解决方案并不总是将数字相加。例如,“我的邮政编码是一二三四五”不应转换为“我的邮政编码是15”。
我接受了安德鲁的回答,并对其进行了调整,以处理人们强调为错误的其他一些情况,并且还增加了对示例的支持,例如我上面提到的邮政编码。下面显示了一些基本的测试用例,但我确定仍有改进的空间。
def is_number(x): if type(x) == str: x = x.replace(',', '') try: float(x) except: return False return True def text2int (textnum, numwords={}): units = [ 'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', ] tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety'] scales = ['hundred', 'thousand', 'million', 'billion', 'trillion'] ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12} ordinal_endings = [('ieth', 'y'), ('th', '')] if not numwords: numwords['and'] = (1, 0) for idx, word in enumerate(units): numwords[word] = (1, idx) for idx, word in enumerate(tens): numwords[word] = (1, idx * 10) for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0) textnum = textnum.replace('-', ' ') current = result = 0 curstring = '' onnumber = False lastunit = False lastscale = False def is_numword(x): if is_number(x): return True if word in numwords: return True return False def from_numword(x): if is_number(x): scale = 0 increment = int(x.replace(',', '')) return scale, increment return numwords[x] for word in textnum.split(): if word in ordinal_words: scale, increment = (1, ordinal_words[word]) current = current * scale + increment if scale > 100: result += current current = 0 onnumber = True lastunit = False lastscale = False else: for ending, replacement in ordinal_endings: if word.endswith(ending): word = "%s%s" % (word[:-len(ending)], replacement) if (not is_numword(word)) or (word == 'and' and not lastscale): if onnumber: # Flush the current number we are building curstring += repr(result + current) + " " curstring += word + " " result = current = 0 onnumber = False lastunit = False lastscale = False else: scale, increment = from_numword(word) onnumber = True if lastunit and (word not in scales): # Assume this is part of a string of individual numbers to # be flushed, such as a zipcode "one two three four five" curstring += repr(result + current) result = current = 0 if scale > 1: current = max(1, current) current = current * scale + increment if scale > 100: result += current current = 0 lastscale = False lastunit = False if word in scales: lastscale = True elif word in units: lastunit = True if onnumber: curstring += repr(result + current) return curstring
一些测试...
one two three -> 123 three forty five -> 345 three and forty five -> 3 and 45 three hundred and forty five -> 345 three hundred -> 300 twenty five hundred -> 2500 three thousand and six -> 3006 three thousand six -> 3006 nineteenth -> 19 twentieth -> 20 first -> 1 my zip is one two three four five -> my zip is 12345 nineteen ninety six -> 1996 fifty-seventh -> 57 one million -> 1000000 first hundred -> 100 I will buy the first thousand -> I will buy the 1000 # probably should leave ordinal in the string thousand -> 1000 hundred and six -> 106 1 million -> 1000000