谁能帮忙分析一下这个算法,布隆过滤器的变种
大概说明:
这段代码摘自一个实时的搜索引擎zoie中的类DocIDMapperImpl的构造函数。
主要功能就是实现docArray和uidArray之间的双向映射。即可以根据uid找到docId,或者docId找到uid。
问题:
1._filter是变种的布隆过滤器?没看懂其实现
2.partitionedUidArray 为啥对其中的数据部分排序?
class TestDocIdMapper { final int[] _docArray; final long[] _uidArray; final int[] _start; final long[] _filter; final int _mask; final int MIXER = 2147482951; // 一个素数 TestDocIdMapper(long[] uidArray) { int len = uidArray.length; int mask = len / 4; mask |= (mask >> 1); mask |= (mask >> 2); mask |= (mask >> 4); mask |= (mask >> 8); mask |= (mask >> 16); _mask = mask; //为什么采用这种方式获取filter的长度? _filter = new long[mask + 1]; for (long uid : uidArray) { if (uid != Long.MIN_VALUE) { int h = (int) ((uid >>> 32) ^ uid) * MIXER; //为什么这样求h? long bits = _filter[h & _mask]; bits |= ((1L << (h >>> 26))); bits |= ((1L << ((h >> 20) & 0x3F))); //这两个或的目的?为什么移位采用26位和20位 _filter[h & _mask] = bits; } } // _filter的目的就是getDocID方法中快速判断是否存在uid _start = new int[_mask + 1 + 1]; len = 0; for (long uid : uidArray) { if (uid != Long.MIN_VALUE) { _start[((int) ((uid >>> 32) ^ uid) * MIXER) & _mask]++; len++; } } int val = 0; for (int i = 0; i < _start.length; i++) { val += _start[i]; _start[i] = val; } _start[_mask] = len; long[] partitionedUidArray = new long[len]; int[] docArray = new int[len]; for (long uid : uidArray) { if (uid != Long.MIN_VALUE) { int i = --(_start[((int) ((uid >>> 32) ^ uid) * MIXER) & _mask]); partitionedUidArray[i] = uid; } } int s = _start[0]; for (int i = 1; i < _start.length; i++) { int e = _start[i]; if (s < e) { Arrays.sort(partitionedUidArray, s, e); } s = e; } for (int docid = 0; docid < uidArray.length; docid++) { long uid = uidArray[docid]; if (uid != Long.MIN_VALUE) { final int p = ((int) ((uid >>> 32) ^ uid) * MIXER) & _mask; int idx = findIndex(partitionedUidArray, uid, _start[p], _start[p + 1]); if (idx >= 0) { docArray[idx] = docid; } } } _uidArray = partitionedUidArray; _docArray = docArray; } private final int findIndex(final long[] arr, final long uid, int begin, int end) { if (begin >= end) return -1; end--; while (true) { int mid = (begin + end) >>> 1; long midval = arr[mid]; if (midval == uid) return mid; if (mid == end) return -1; if (midval < uid) begin = mid + 1; else end = mid; } } //根据uid获取docId的方法 public int getDocID(final long uid) { final int h = (int) ((uid >>> 32) ^ uid) * MIXER; final int p = h & _mask; // 首先在过滤器中判断uid是否存在,如果不存在就不查找 final long bits = _filter[p]; if ((bits & (1L << (h >>> 26))) == 0 || (bits & (1L << ((h >> 20) & 0x3F))) == 0) return -1; // do binary search in the partition int begin = _start[p]; int end = _start[p + 1] - 1; // we have some uids in this partition, so we assume (begin <= end) while (true) { int mid = (begin + end) >>> 1; long midval = _uidArray[mid]; if (midval == uid) return _docArray[mid]; if (mid == end) return -1; if (midval < uid) begin = mid + 1; else end = mid; } } }