weak-and算法原理演示(wand)
推荐一个在信息检索中用到的weak-and算法,这个算法在广告系统中有成熟的应用。#!/usr/bin/python#wangben updated 20130108class WAND:'''implement wand algorithm'''def __init__(self, InvertIndex, last_docid):self.invert_index = InvertIndex #InvertIndex: term -> docid1, docid2, docid3 ...self.current_doc = 0self.current_invert_index = {}self.query_terms = []self.threshold = 2self.sort_terms = []self.LastID = 2000000000 #big numself.debug_count = 0self.last_docid = last_dociddef __InitQuery(self, query_terms):'''check terms len > 0'''self.current_doc = -1self.current_invert_index.clear()self.query_terms = query_termsself.sort_terms[:] = []self.debug_count = 0for term in query_terms:#initial start pos from the first position of term's invert_indexself.current_invert_index[term] = [ self.invert_index[term][0], 0 ] #[ docid, index ]def __SortTerms(self):if len(self.sort_terms) == 0:for term in self.query_terms:if term in self.current_invert_index:doc_id = self.current_invert_index[term][0]self.sort_terms.append([ int(doc_id), term ])self.sort_terms.sort()def __PickTerm(self, pivot_index):return 0def __FindPivotTerm(self):score = 0for i in range(0, len(self.sort_terms)):score += 1if score >= self.threshold:return [ self.sort_terms[i][1], i]return [ None, len(self.sort_terms) ]def __IteratorInvertIndex(self, change_term, docid, pos):'''move to doc id > docid'''doc_list = self.invert_index[change_term]i = 0for i in range(pos, len(doc_list)):if doc_list[i] >= docid:pos = idocid = doc_list[i]breakreturn [ docid, pos ]def __AdvanceTerm(self, change_index, docid ):change_term = self.sort_terms[change_index][1]pos = self.current_invert_index[change_term][1](new_doc, new_pos) = \self.__IteratorInvertIndex(change_term, docid, pos)self.current_invert_index[change_term] = \[ new_doc , new_pos ]self.sort_terms[change_index][0] = new_docdef __Next(self):if self.last_docid == self.current_doc:return Nonewhile True:self.debug_count += 1#sort terms by doc idself.__SortTerms()#find pivot term > threshold(pivot_term, pivot_index) = self.__FindPivotTerm()if pivot_term == None:#no more candidatereturn None#debug_info:for i in range(0, pivot_index + 1):print self.sort_terms[i][0],self.sort_terms[i][1],"|",print ""pivot_doc_id = self.current_invert_index[pivot_term][0]if pivot_doc_id == self.LastID: #!!return Noneif pivot_doc_id <= self.current_doc:change_index = self.__PickTerm(pivot_index)self.__AdvanceTerm( change_index, self.current_doc + 1 )else:first_docid = self.sort_terms[0][0]if pivot_doc_id == first_docid:self.current_doc = pivot_doc_idreturn self.current_docelse:#pick all preceding termfor i in range(0, pivot_index):change_index = iself.__AdvanceTerm( change_index, pivot_doc_id )def DoQuery(self, query_terms):self.__InitQuery(query_terms)while True:candidate_docid = self.__Next()if candidate_docid == None:breakprint "candidate_docid:",candidate_docid#insert candidate_docid to heap#update thresholdprint "debug_count:",self.debug_countif __name__ == "__main__":testIndex = {}testIndex["t1"] = [ 0, 1, 2, 3, 6 , 2000000000]testIndex["t2"] = [ 3, 4, 5, 6, 2000000000 ]testIndex["t3"] = [ 2, 5, 2000000000 ]testIndex["t4"] = [ 4, 6, 2000000000 ]w = WAND(testIndex, 6)w.DoQuery(["t1", "t2", "t3", "t4"])
输出结果中会展示next中循环的次数,以及最后被选为candidate的docid这里省略了建立堆的过程,使用了一个默认阈值2作为doc的删选条件,候选doc和query doc采用重复词的个数计算UB,这里只是一个算法演示,实际使用的时候需要根据自己的相关性公式进行调整