python正向最大匹配分词和逆向最大匹配分词的实例_Python

Python 正向最大匹配分词和逆向最大匹配分词的实例

2021-04-19 00:15 yan456jie Python

今天小编就为大家分享一篇 Python 正向最大匹配分词和逆向最大匹配分词的实例，具有很好的参考价值，希望对大家有所帮助。一起跟随小编过来看看吧。

正向最大匹配

				?

									# -*- coding:utf-8 -*-

									# Encoding assumed for byte-string input when the caller does not name one.
									CODEC = 'utf-8'

									def u(s, encoding=CODEC):
									  """Return ``s`` as a text (unicode) string.

									  s        -- a text string (returned unchanged) or a byte string
									  encoding -- codec used to decode ``s`` when it is a byte string;
									              defaults to ``CODEC``

									  Works on both Python 2 (``unicode``) and Python 3 (``str``).
									  """
									  try:
									    text_type = unicode  # Python 2 text type
									  except NameError:
									    # Python 3 has no ``unicode`` builtin; ``str`` is the text type.
									    text_type = str
									  if isinstance(s, text_type):
									    return s
									  return text_type(s, encoding)

									def fwd_mm_seg(wordDict, maxLen, str):
									  """Segment ``str`` by forward maximum matching and return the pieces.

									  Scanning from the left, the longest prefix of at most ``maxLen``
									  characters that is found in ``wordDict`` becomes the next segment.
									  When no prefix of two or more characters matches, a single character
									  is emitted.

									  wordDict -- container of known words (only iteration and ``in`` are used)
									  maxLen   -- maximum candidate length in characters; must be >= 1
									  str      -- text to segment (the name shadows the builtin; it is kept
									              so existing keyword callers keep working)

									  Returns the list of segments in text order.
									  Raises ValueError if ``maxLen`` is smaller than 1.
									  Also prints a trace (dictionary, candidates, result) to stdout.
									  """
									  if maxLen < 1:
									    # A window shorter than one character never consumes any text, so
									    # the scan loop below would never terminate.
									    raise ValueError('maxLen must be at least 1, got %r' % (maxLen,))

									  for word in wordDict:
									    print('word:  %s' % (word,))
									  print("\n")

									  wordList = []
									  pos = 0  # index of the first character not yet segmented
									  total = len(str)
									  while pos < total:
									    # Start with the widest window that still fits in the remaining text.
									    wordLen = min(maxLen, total - pos)
									    subStr = str[pos:pos + wordLen]
									    print('subStr:  %s' % (subStr,))
									    # Shrink the candidate from the right until it is a known word or
									    # only one character is left.
									    while wordLen > 1:
									      if subStr in wordDict:
									        print("subStr1: %r" % (subStr,))
									        break
									      print("subStr2: %r" % (subStr,))
									      wordLen -= 1
									      subStr = subStr[:wordLen]
									    wordList.append(subStr)
									    pos += wordLen

									  for wordstr in wordList:
									    print('wordstr:  %s' % (wordstr,))
									  return wordList

									def main():
									  """Load the word list from ``words.dic`` and print a demo segmentation.

									  Each non-blank line of ``words.dic`` is stripped, decoded as UTF-8 and
									  used as one dictionary entry.  The sample sentence is then segmented
									  with ``fwd_mm_seg`` and the pieces are printed joined by ``==``.
									  """
									  wordDict = {}
									  # ``with`` closes the dictionary file even if decoding a line fails.
									  with open('words.dic') as fp_dict:
									    for eachWord in fp_dict:
									      word = eachWord.strip()
									      if word:  # a blank line would otherwise add an empty-string entry
									        wordDict[u(word, 'utf-8')] = 1

									  segStr = u'你好世界hello world'
									  print(segStr)
									  wordList = fwd_mm_seg(wordDict, 10, segStr)
									  print("==".join(wordList))

									if __name__ == '__main__':
									  main()

逆向最大匹配

				?

									# -*- coding:utf-8 -*-

									# Encoding assumed for byte-string input when the caller does not name one.
									# Defined before ``u`` because it is used as that function's default.
									CODEC = 'utf-8'

									def u(s, encoding=CODEC):
									  """Return ``s`` as a text (unicode) string.

									  s        -- a text string (returned unchanged) or a byte string
									  encoding -- codec used to decode ``s`` when it is a byte string;
									              defaults to ``CODEC``

									  Works on both Python 2 (``unicode``) and Python 3 (``str``).
									  """
									  try:
									    text_type = unicode  # Python 2 text type
									  except NameError:
									    # Python 3 has no ``unicode`` builtin; ``str`` is the text type.
									    text_type = str
									  if isinstance(s, text_type):
									    return s
									  return text_type(s, encoding)

									def bwd_mm_seg(wordDict, maxLen, str):
									  """Segment ``str`` by backward maximum matching and return the pieces.

									  Scanning from the right, the longest suffix of at most ``maxLen``
									  characters that is found in ``wordDict`` becomes the next segment.
									  When no suffix of two or more characters matches, a single character
									  is emitted.

									  wordDict -- container of known words (only iteration and ``in`` are used)
									  maxLen   -- maximum candidate length in characters; must be >= 1
									  str      -- text to segment (the name shadows the builtin; it is kept
									              so existing keyword callers keep working)

									  Returns the list of segments in text order (left to right).
									  Raises ValueError if ``maxLen`` is smaller than 1.
									  Also prints a trace (dictionary, candidates, result) to stdout.
									  """
									  if maxLen < 1:
									    # A window shorter than one character never consumes any text, so
									    # the scan loop below would never terminate.
									    raise ValueError('maxLen must be at least 1, got %r' % (maxLen,))

									  for word in wordDict:
									    print('word:  %s' % (word,))
									  print("\n")

									  wordList = []
									  end = len(str)  # one past the last character not yet segmented
									  while end > 0:
									    # Start with the widest window that still fits in the remaining text.
									    wordLen = min(maxLen, end)
									    subStr = str[end - wordLen:end]
									    print('subStr:  %s' % (subStr,))
									    # Shrink the candidate from the left until it is a known word or
									    # only one character is left.
									    while wordLen > 1:
									      if subStr in wordDict:
									        print("subStr1: %r" % (subStr,))
									        break
									      print("subStr2: %r" % (subStr,))
									      wordLen -= 1
									      subStr = subStr[-wordLen:]
									    wordList.append(subStr)
									    end -= wordLen

									  # Segments were collected right-to-left; restore reading order.
									  wordList.reverse()
									  for wordstr in wordList:
									    print('wordstr:  %s' % (wordstr,))
									  return wordList

									def main():
									  """Load the word list from ``words.dic`` and print a demo segmentation.

									  Each non-blank line of ``words.dic`` is stripped, decoded as UTF-8 and
									  used as one dictionary entry.  The sample sentence is then segmented
									  with ``bwd_mm_seg`` and the pieces are printed joined by ``==``.
									  """
									  wordDict = {}
									  # ``with`` closes the dictionary file even if decoding a line fails.
									  with open('words.dic') as fp_dict:
									    for eachWord in fp_dict:
									      word = eachWord.strip()
									      if word:  # a blank line would otherwise add an empty-string entry
									        wordDict[u(word, 'utf-8')] = 1

									  # Plain ``u''`` literal: the text has no backslashes, so the raw flag
									  # of the former ``ur''`` prefix was redundant, and ``ur`` is rejected
									  # by Python 3.
									  segStr = u'你好世界hello world'
									  print(segStr)
									  wordList = bwd_mm_seg(wordDict, 10, segStr)
									  print("==".join(wordList))

									if __name__ == '__main__':
									  main()