初学者报道(2):实现 1-gram分词算法
by ricky
at 2011-07-19 13:21:26
original http://feedproxy.google.com/~r/52nlp/~3/gGQv4B7dJY4/%e5%88%9d%e5%ad%a6%e8%80%85%e6%8a%a5%e9%81%93%ef%bc%882%ef%bc%89%ef%bc%9a%e5%ae%9e%e7%8e%b0-1-gram%e5%88%86%e8%af%8d%e7%ae%97%e6%b3%95
写了个1-gram的分词算法实现:
借鉴了之前在这个 blog 上看到的 n-gram 算法中 split 函数的写法,其他部分是自己写的。
Dictionary.py:
class Dictionary:
    """Load a word-frequency dictionary and answer count/probability queries.

    The dictionary file has one entry per line, encoded in cp936 (GBK):
    ``word<TAB>count``.
    """

    def __init__(self, dicname):
        # Local import keeps this module self-contained; codecs.open decodes
        # cp936 transparently on both Python 2 and Python 3.
        import codecs
        self.dictMap = {}  # word -> raw count string
        self.N = 0         # total token count over all entries
        dictfile = codecs.open(dicname, 'r', 'cp936')
        try:
            for eachLine in dictfile:
                # BUGFIX: the published code split on the literal characters
                # "t" and "n" (the backslashes were lost when the post was
                # published); the separators must be "\t" and "\n".
                strlist = eachLine.split("\t", 2)
                self.dictMap[strlist[0]] = strlist[1].split("\n", 1)[0]
                self.N += int(self.dictMap[strlist[0]])
        finally:
            # Close the file even if a malformed line raises.
            dictfile.close()
        print(self.N)

    def getCount(self, wordname):
        """Return the word's count, or 0.5 when it is not in the dictionary.

        The pseudo-count 0.5 is additive smoothing for unknown words.
        """
        if wordname in self.dictMap:
            return int(self.dictMap[wordname])
        return 0.5

    def getPvalue(self, wordname):
        """Return count(word) / N — the unigram probability of the word."""
        return float(self.getCount(wordname)) / self.N

    def isAWord(self, word):
        """Return True iff the word appears in the dictionary."""
        return word in self.dictMap


if __name__ == '__main__':
    dict1 = Dictionary("dict.txt")
class Ngram:
    """Unigram (1-gram) segmenter: dynamic programming with memoization.

    For a sentence S, maxP(S) is the maximum over all binary splits of the
    product of the two halves' best probabilities, compared against the
    probability of S taken as a single dictionary word.
    """

    def __init__(self, dictionary):
        self.mDict = dictionary  # Dictionary-like: getPvalue/getCount/isAWord
        self.wordList = ()       # kept for interface compatibility; unused
        self.valueMap = {}       # memo: substring -> best probability
        self.segMap = {}         # substring -> best split pair, or itself

    def splitsentence(self, sentence):
        """Return every (prefix, suffix) split, including (sentence, '')."""
        wordlist = []
        for eachNum in range(len(sentence)):
            wordlist.append((sentence[:eachNum + 1], sentence[eachNum + 1:]))
        return wordlist

    def maxP(self, sentence):
        """Return the best segmentation probability of ``sentence``.

        Side effect: records the winning split in ``self.segMap``.
        """
        if len(sentence) <= 1:
            return self.mDict.getPvalue(sentence)
        # Memo hit: this substring was already fully solved (the original
        # only consulted the memo inside the pair loop, so direct repeat
        # calls recomputed; the cached value is final, behavior unchanged).
        if sentence in self.valueMap:
            return self.valueMap[sentence]
        maxPvalue = 0
        wordPair = []
        for eachPair in self.splitsentence(sentence):
            # Skip the degenerate split where the suffix is empty.
            if len(eachPair[0]) > 0 and len(eachPair[1]) > 0:
                if eachPair[0] in self.valueMap:
                    p1 = self.valueMap[eachPair[0]]
                else:
                    p1 = self.maxP(eachPair[0])
                if eachPair[1] in self.valueMap:
                    p2 = self.valueMap[eachPair[1]]
                else:
                    p2 = self.maxP(eachPair[1])
                wordP = p1 * p2
                if maxPvalue < wordP:
                    maxPvalue = wordP
                    wordPair = eachPair
        # Compare the best split against treating the whole string as one word.
        v = self.mDict.getPvalue(sentence)
        if v > maxPvalue and self.mDict.isAWord(sentence):
            self.valueMap[sentence] = v
            self.segMap[sentence] = sentence
            return v
        self.valueMap[sentence] = maxPvalue
        self.segMap[sentence] = wordPair
        return maxPvalue

    def getSeg(self):
        """Return the segmentation map built up by maxP calls."""
        return self.segMap


if __name__ == "__main__":
    ngram1 = Ngram("dict1")
    print(ngram1.splitsentence("ABC"))
from Dictionary import Dictionary
from ngram import Ngram


def printSeg(segMap, sentence):
    """Recursively print the best segmentation in segMap, one word per line."""
    if sentence in segMap:
        pair = segMap[sentence]
        if isinstance(pair, tuple):
            # Best result is a (prefix, suffix) split: print both halves.
            printSeg(segMap, pair[0])
            printSeg(segMap, pair[1])
        elif sentence == pair:
            # The string maps to itself: it is a single dictionary word.
            print(sentence)
        else:
            printSeg(segMap, pair)
    else:
        # Substrings never entered into the map (e.g. single characters).
        print(sentence)


def main():
    """Interactive loop: read a sentence, segment it, print the result."""
    dict1 = Dictionary("dict.txt")
    try:
        read_line = raw_input  # Python 2: returns encoded bytes
    except NameError:
        read_line = input      # Python 3: returns text
    while True:
        # Fresh memo tables for each sentence.
        ngram1 = Ngram(dict1)
        sentence = read_line("please input a Chinese Sentence:")
        if isinstance(sentence, bytes):
            # Python 2 console input is cp936-encoded bytes; decode to text.
            sentence = sentence.decode("cp936")
        print(ngram1.maxP(sentence))
        printSeg(ngram1.getSeg(), sentence)


if __name__ == "__main__":
    main()
相关文章: