用Java实现搜索引擎布尔运算
索引类:
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.BitSet;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

/**
 * A simple inverted index: every key maps to a {@link BitSet} whose set bits
 * are the integer ids that were registered for that key.
 *
 * <p>Supports boolean AND / OR queries over several keys. When serialized, the
 * key-to-bitset map is written as a GZIP-compressed byte array.
 */
public class Index implements Serializable {

    private static final long serialVersionUID = 7362753433812661741L;

    /** Key -> bit set of ids registered for that key. */
    private Map<String, BitSet> indexMap;

    /**
     * Custom serialization hook: serializes the map into a GZIP-compressed
     * in-memory buffer and writes that buffer as a single byte array.
     *
     * @param out the stream the enclosing serialization is writing to
     * @throws IOException if writing or compressing fails
     */
    private void writeObject(ObjectOutputStream out) throws IOException {
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        ObjectOutputStream objOut = new ObjectOutputStream(new GZIPOutputStream(buf));
        try {
            objOut.writeObject(indexMap);
        } finally {
            // Closing also finishes the GZIP stream, so it must happen before
            // the buffer content is read below.
            objOut.close();
        }
        out.writeObject(buf.toByteArray());
    }

    /**
     * Custom deserialization hook: reads the compressed byte array produced by
     * {@link #writeObject(ObjectOutputStream)} and restores the map from it.
     *
     * @param in the stream the enclosing deserialization is reading from
     * @throws IOException if reading or decompressing fails
     * @throws ClassNotFoundException if a serialized class cannot be resolved
     */
    @SuppressWarnings("unchecked")
    private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
        byte[] buf = (byte[]) in.readObject();
        ObjectInputStream objIn =
                new ObjectInputStream(new GZIPInputStream(new ByteArrayInputStream(buf)));
        try {
            indexMap = (Map<String, BitSet>) objIn.readObject();
        } finally {
            objIn.close();
        }
    }

    /**
     * Creates an index sized for roughly {@code indexSize} distinct keys.
     *
     * @param indexSize expected number of distinct keys
     */
    public Index(int indexSize) {
        // Computed in long so that large sizes do not overflow int.
        int initialCapacity = (int) Math.min((long) Integer.MAX_VALUE, indexSize * 4L / 3);
        indexMap = new HashMap<String, BitSet>(initialCapacity);
    }

    /** Creates an index with a small default capacity. */
    public Index() {
        this(12);
    }

    /**
     * Registers {@code id} under every key of the collection.
     *
     * @param c keys to register the id for
     * @param id the id to record
     */
    public void setId(Collection<String> c, int id) {
        for (String key : c) {
            mark(key, id);
        }
    }

    /**
     * Registers {@code id} under every key of the array.
     *
     * @param c keys to register the id for
     * @param id the id to record
     */
    public void setId(String[] c, int id) {
        for (String key : c) {
            mark(key, id);
        }
    }

    /** Sets bit {@code id} in the bit set of {@code key}, creating the bit set on first use. */
    private void mark(String key, int id) {
        BitSet bit = indexMap.get(key);
        if (bit == null) {
            bit = new BitSet();
            indexMap.put(key, bit);
        }
        bit.set(id);
    }

    /**
     * Returns the ids registered under <em>all</em> of the given keys.
     *
     * @param keys at least two keys
     * @return ascending ids common to every key; an empty array if some (but
     *         not all) keys are not indexed or no id is shared; {@code null}
     *         if none of the keys is indexed
     * @throws NullPointerException if {@code keys} is {@code null}
     * @throws IllegalArgumentException if fewer than two keys are given
     */
    public int[] getIdSetWithAnd(String... keys) {
        checkKeys(keys);
        BitSet result = null;
        boolean anyMissing = false;
        for (String key : keys) {
            BitSet bit = indexMap.get(key);
            if (bit == null) {
                anyMissing = true;
            } else if (result == null) {
                // Work on a copy so the indexed bit set is never modified.
                result = (BitSet) bit.clone();
            } else {
                result.and(bit);
            }
        }
        if (result == null) {
            return null;
        }
        if (anyMissing) {
            // A key without any ids cannot be matched, so the intersection is empty.
            return new int[0];
        }
        return getIdSet(result);
    }

    /**
     * Returns the ids registered under <em>any</em> of the given keys. Keys
     * that are not indexed are ignored.
     *
     * @param keys at least two keys
     * @return ascending ids registered under at least one key, or {@code null}
     *         if none of the keys is indexed
     * @throws NullPointerException if {@code keys} is {@code null}
     * @throws IllegalArgumentException if fewer than two keys are given
     */
    public int[] getIdSetWithOr(String... keys) {
        checkKeys(keys);
        BitSet result = null;
        for (String key : keys) {
            BitSet bit = indexMap.get(key);
            if (bit == null) {
                continue;
            }
            if (result == null) {
                // Work on a copy so the indexed bit set is never modified.
                result = (BitSet) bit.clone();
            } else {
                result.or(bit);
            }
        }
        return result == null ? null : getIdSet(result);
    }

    /** Rejects a {@code null} key array and arrays with fewer than two keys. */
    private static void checkKeys(String... keys) {
        if (keys == null) {
            throw new NullPointerException("keys is null.");
        }
        if (keys.length < 2) {
            throw new IllegalArgumentException("keys' length is less than 2.");
        }
    }

    /**
     * Returns the ids registered under a single key.
     *
     * @param key the key to look up
     * @return ascending ids for the key, or {@code null} if the key is not indexed
     */
    public int[] getIdSet(String key) {
        BitSet bit = indexMap.get(key);
        return bit == null ? null : getIdSet(bit);
    }

    /** Converts the set bits of {@code bit} into an ascending array of indices. */
    private int[] getIdSet(BitSet bit) {
        int[] ids = new int[bit.cardinality()];
        int j = 0;
        for (int i = bit.nextSetBit(0); i >= 0; i = bit.nextSetBit(i + 1)) {
            ids[j++] = i;
            if (i == Integer.MAX_VALUE) {
                break; // i + 1 would overflow
            }
        }
        return ids;
    }
}
import java.io.File;import java.io.IOException;import java.util.Arrays;import java.util.regex.Pattern;import bluechip.io.SerializeUtils;import bluechip.io.file.AbstractFileProcessor;import bluechip.io.file.FileProcessor;public class IndexTest {/** * @param args */public static void main(String[] args) throws Exception {//统计一下运行时间long time = System.currentTimeMillis();File file = new File("d:/index.dat");Index data = null;try {//到从文件读取序列化对象data = SerializeUtils.readObject(file);} catch (Exception ex) {final Index index = new Index(4000);final Pattern pattern = Pattern.compile("\\s+");//简单的分词FileProcessor fp = new AbstractFileProcessor(new File("D:/英文版世界名著[下]/罪与罚.txt")) {@Overrideprotected void processLine(String line) throws IOException {String[] words = pattern.split(line);//一行一条记录index.setId(words, this.getLineNumber());}};fp.process();data = index;//序列化存储到文件SerializeUtils.writeObject(data, file);}//查找存在下列单词的行号int[] ids = data.getIdSetWithAnd("his", "and", "was", "were", "as", "to");System.out.println(Arrays.toString(ids));System.out.println(ids.length);System.out.println(System.currentTimeMillis() - time);}}