算法擂台——依据文本产生字典

2011-12-25

算法擂台——根据文本产生字典注意，测试数据已经上传，位于：http://download.csdn.net/source/3257888上一个

算法擂台——根据文本产生字典
注意，测试数据已经上传，位于：http://download.csdn.net/source/3257888
上一个问题是不是有点难，先来一个简单的问题热身下。

考虑这样一个问题，给定一段输入文本，从中提取所有的单词，并且输出，要求输出的单词经过排序，不区分大小写，而且不重复和遗漏。

比如："Hello World! Hello everyone, my name is caozhy!"
输出：
caozhy
everyone
hello
is
my
name
world

下面给一个我的实现：

C# code

    class DictGen    {        class Node        {            public Dictionary<char, Node> Dict { get; set; }            public Node() { Dict = new Dictionary<char, Node>(); }            public IEnumerable<string> Get()            {                foreach (var item in Dict.OrderBy(x => x.Key))                {                    if (item.Key == '\0') yield return "";                    foreach (var item1 in item.Value.Get())                    {                        yield return item.Key.ToString().ToLower() + item1;                    }                }            }        }        public static IEnumerable<string> ParseString(TextReader tr)        {            Node rootNode = new Node();            Node currNode = rootNode;            int state = 0;            char[] arr;            try            {                while (true)                {                    arr = tr.ReadLine().ToUpper().ToCharArray();                    for (int i = 0; i < arr.GetLength(0); i++)                    {                        if (arr[i] >= 65 && arr[i] <= 90)                        {                            state = 1;                            if (!currNode.Dict.ContainsKey(arr[i]))                                currNode.Dict.Add(arr[i], new Node());                            currNode = currNode.Dict[arr[i]];                        }                        else                        {                            if (state == 1)                            {                                if (!currNode.Dict.ContainsKey('\0'))                                    currNode.Dict.Add('\0', new Node());                                state = 0;                            }                            currNode = rootNode;                        }                    }                }            }            catch            { }            return rootNode.Get();        }    }

[解决办法]
mark,向MVP学习
[解决办法]
最好能给个区分变异词，同义词的算法
[解决办法]

C# code

using System;using System.Collections.Generic;using System.Linq;using System.Text;using System.Threading;using System.Text.RegularExpressions;namespace ConsoleApplication1{    class Program    {        static void Main(string[] args)        {            foreach (var item in GetStr("Hello World! Hello everyone, my name is caozhy!"))            {                Console.WriteLine(item);            }        }        static IEnumerable<string> GetStr(string str)        {            string[] list = str.Split(new char[] { ' ', ',', '!' }, StringSplitOptions.RemoveEmptyEntries);            return list.Distinct().OrderBy(i => i.ToLowerInvariant()).AsEnumerable();        }    }}
[解决办法]
这种问题我一般用c++使用STL的map，直接put，然后使用iterator输出。
O(nlogn)
[解决办法]
C# codeusing System;using System.Collections.Generic;using System.Linq;using System.Text;using System.Threading;using System.Text.RegularExpressions;namespace ConsoleApplication1{    class Program    {        static void Main(string[] args)        {            foreach (var item in GetStr("Hello World! Hello everyone, my name is caozhy!,ni ,hao ,...."))            {                Console.WriteLine(item);            }        }        static IEnumerable<string> GetStr(string str)        {                     Regex re = new Regex(@"[a-zA-Z]+");            List<string> list = new List<string>();            MatchCollection mc = re.Matches(str);            foreach (Match item in mc)            {                list.Add(item.Value);            }            return list.Distinct().OrderBy(i => i.ToLowerInvariant()).AsEnumerable();        }    }} 
 
[解决办法]
I'm
aren't
这种算一个词还是两个词？
[解决办法]
手上恰好没有很长的英文 电子版的书。。
试了几个短的，没发现什么问题
[解决办法]
探讨

心理視覺模式聽覺模型

[解决办法]
C/C++ code#include <iostream>#include <map>#include <string>using namespace std;int main(){    map<string,bool> result;    string s = "Hello World! Hello everyone, my name is caozhy!";    int p=0;    string word;    while(p<s.size()){        char now = tolower(s[p]);        if(now>='a' && now<='z')            word += now;        else{            result[word]=true;            word="";        }        p++;    }    for(map<string,bool>::iterator iter=result.begin();iter != result.end();iter++)    {        cout<<iter->first<<endl;    }}
[解决办法]
不好意思。。没注意是C#专区……
[解决办法]
C# code static IEnumerable<string> GetStr(string str)        {            Regex re = new Regex(@"[a-zA-Z]+");            List<string> list = new List<string>();            MatchCollection mc = re.Matches(str);            foreach (Match item in mc)            {                list.Add(item.Value.ToLowerInvariant());            }            return list.Distinct().OrderBy(i =>i).AsEnumerable();        }
[解决办法]
二叉排序?
[解决办法]
是比你的少些，我看看源文件
[解决办法]
不多的话  贴这里就好
[解决办法]
C# code        static IOrderedEnumerable<string> GetWords(TextReader tr)        {            HashSet<string> hs = new HashSet<string>();            Queue<char> q = new Queue<char>();            int i = tr.Read();            while (i != -1)            {                if (i >= 65 && i <= 90)                {                    i += 32;                    q.Enqueue((char)i);                }                else if (i >= 97 && i <= 122)                {                    q.Enqueue((char)i);                }                else                {                    if (q.Count > 0)                    {                        hs.Add(new string(q.ToArray()));                        q.Clear();                    }                }                i = tr.Read();            }            tr.Close();            return hs.OrderBy(s => s);        }
[解决办法]
换SortedList试试：
C# code        static void Main(string[] args)        {            TextReader tr = new StringReader("Hello World! Hello everyone, my name is caozhy!");            var hs = GetWords(tr);            foreach (string s in hs.Keys)                Console.WriteLine(s);        }        static SortedList<string, bool> GetWords(TextReader tr)        {            SortedList<string, bool> sl = new SortedList<string, bool>();            Queue<char> q = new Queue<char>();            int i = tr.Read();            while (i != -1)            {                if (i >= 65 && i <= 90)                {                    i += 32;                    q.Enqueue((char)i);                }                else if (i >= 97 && i <= 122)                {                    q.Enqueue((char)i);                }                else                {                    if (q.Count > 0)                    {                        string str = new string(q.ToArray());                        if (!sl.ContainsKey(str))                            sl.Add(str, true);                        q.Clear();                    }                }                i = tr.Read();            }            tr.Close();            return sl;        } 
 
[解决办法]
数据传上去了吗？
我看我的为什么少了
[解决办法]
C# code class Program    {        static void Main(string[] args)        {            string str = File.ReadAllText(@"C:\123.txt");            DateTime dt = DateTime.Now;            char[] buffer = new char[64];//假设没有超过64个字符的单词            int index = 0;            char c;            BST bst = new BST();            for (int i = 0; i < str.Length; i++)            {                c = (char)(str[i] | 0x20);                if (c >= 97 && c <= 122)                {                    buffer[index++] = c;                }                else if (index > 0)                {                    bst.Insert(new string(buffer,0,index));                    index = 0;                }            }                  bst.Root.Output();         }           }    public class BSTNode    {        public string Value { get; set; }        public BSTNode Left { get; set; }        public BSTNode Right { get; set; }        public BSTNode(string value)        {            this.Value = value;        }        public void InsertNode(BSTNode node)        {            if (string.Compare(this.Value, node.Value) < 0)            {                if (this.Right == null)                    this.Right = node;                else                    this.Right.InsertNode(node);            }            else if (string.Compare(this.Value, node.Value) > 0)            {                if (this.Left == null)                    this.Left = node;                else                    this.Left.InsertNode(node);            }        }        public void Output()        {            if (this.Left != null)                this.Left.Output();            Console.WriteLine(this.Value);            if (this.Right != null)                this.Right.Output();        }    }    public class BST    {        public BSTNode Root { get; set; }        public void Insert(string value)        {            BSTNode node = new BSTNode(value);            if (Root == null)                Root = node;            else                Root.InsertNode(node);        }          }
[解决办法]
直接用trie行不

用初始状态表示空串或者一个非法串。
在初始状态下读入一个非字母，则还是初始状态。
读一个字母则按trie的规则进入下一个状态。
在非初始状态下读入一个非字母，则标记这个状态为有单词，然后进入初始状态。

也就是说在trie的基础上稍微变形。

最后维护一个当前单词，dfs这个trie，dfs的时候注意一下先搞ascii靠前的，遇到了一个有单词的状态就可以输出了。

这样，时间复杂度就是输入串的长度，总共只扫描一次。
空间复杂度相对有点大。
[解决办法]
探讨
我的大约 1.0 ~ 1.2 sec. 你的大约 0.75 ~ 0.9 sec.

不过能接近库的效率我已经很满足了。。。

[解决办法]
不过从结果来看，我的结果与LCL是一样的，比楼主的少了一部分
[解决办法]
探讨
最好能给个区分变异词，同义词的算法

[解决办法]
不用Queue<char>，直接用char数组，百度了一下，目前除了“色氨酸合成酶A蛋白质”这个变态的包括1913个字母的单词之外，就数“矽肺病”这个单词最长了，有45个字母：
C# code        static IOrderedEnumerable<string> GetWords(TextReader tr)        {            HashSet<string> hs = new HashSet<string>();            char[] cs = new char[45];            int i = tr.Read();            int index = 0;            int length = 0;            while (i != -1)            {                if (i >= 65 && i <= 90)                {                    cs[index++] = (char)(i + 32);                    length++;                }                else if (i >= 97 && i <= 122)                {                    cs[index++] = (char)i;                    length++;                }                else                {                    if (length > 0)                    {                        hs.Add(new string(cs, 0, length));                        index = 0;                        length = 0;                    }                }                i = tr.Read();            }            tr.Close();            return hs.OrderBy(s => s);        } 
 
[解决办法]
原来是那么简单的问题，我还以为考虑中文，词组，原来仅仅是考虑英文单词！一个循环，再来个排序就搞定了！
[解决办法]
感觉先把单词分开，然后存入STL的map或者set里面，然后用迭代器输出应该可以搞定，不过这样效率都取决于容器的效率的，就背离LZ的意思了
[解决办法]
C# code凑个热闹using System;using System.Collections.Generic;using System.Linq;using System.Text;using System.Text.RegularExpressions;namespace Arithmetic{    class Program    {        static List<string> ListWords = new List<string>();         static void Main(string[] args)        {            string str = "Hello World Hello everyone my name is caozhy";            GetWords(str);            ListWords.ForEach(delegate(string word)            {                Console.WriteLine(word);            });        }        public static void GetWords(string text)        {            string strWords = text.ToLower();            Regex regexWord = new Regex("[a-z]+");            MatchCollection mathCollection= regexWord.Matches(strWords);            foreach (Match m in mathCollection)            {                ListWords.Add(m.Value);            }        }    }}
[解决办法]
算法问题…………自愧不如
[解决办法]
探讨

        private void button1_Click(object sender, EventArgs e)
        {
            long l = DateTime.Now.Ticks;

            StreamReader objReader = new StreamReader("c:\\1.txt");
           ……

[解决办法]
C# code        MatchCollection matches = Regex.Matches(s, @"[a-zA-Z]+");        List<string> words = new List<string>();        for (int i = 0; i < matches.Count; i++)            words.Add(matches[i].Value);        words.Sort();        for (int i = 1; i < words.Count; i++)        {            if (words[i] == words[i - 1])            {                words.RemoveAt(i--);                continue;            }            if (char.IsUpper(words[i][0]) && words[i].ToLower() == words[i - 1])                words.RemoveAt(i--);        }
[解决办法]
发现楼主得到的结果，有很多源文本里没有，比如
aboutmore
aboutwhat
……
类似的还有很多。
[解决办法]
探讨
引用:
C# code

凑个热闹
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace Arithmetic
{
class ……

[解决办法]
经测试，楼主（caozhy ）和3楼（LCL_data）的算法，楼主更快。
Windows XP3 32位笔记本，不考虑界面显示，
caozhy 耗时为 165±10 毫秒  
LCL_data 耗时为 220±10 毫秒

27楼的测试结果有一定问题，因为 caozhy 算法中使用了 tr.ReadLine() ，为公平起见，这句话应更改为 tr.ReadToEnd()，并去掉 while(true) 。
[解决办法]
33楼 (jiabiao113) 的结果为：
9880±50 毫秒
[解决办法]
楼主给出的单词好像总是多一些。
C# codepublic static IEnumerable<string> ParseString1(TextReader tr){    return ParseString2(tr).Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries).Distinct().OrderBy(i => i);}public static string ParseString2(TextReader tr){    var s = tr.ReadToEnd().ToCharArray();    for (var x = 0; x < s.Length; x++)        if (s[x] >= 65 && s[x] <= 90)            s[x] += (char)32;        else if (s[x] < 97 || s[x] > 122)            s[x] = ' ';    return new string(s);} 
 
[解决办法]
版主推荐区进来的
不会用c#
我用python写一个行不行
import string

def stat(strin):
   maketrans = strin.maketrans(string.punctuation, ' '*len(string.punctuation))
   strin = strin.translate(maketrans)
   strin = strin.strip()
   word = strin.strip()
   word.sort()
   for i in word:
       print(i)
[解决办法]
版主推荐区进来的
不会用c#
我用python写一个行不行
import string

def stat(strin):
   maketrans = strin.maketrans(string.punctuation, ' '*len(string.punctuation))
   strin = strin.translate(maketrans)
   strin = strin.strip()
   word = strin.strip()
   word.sort()
   for i in word:
       print(i)
[解决办法]
排序,输出,
输出的时候检查当前词汇.如果是老单词就不输出.
[解决办法]
trie tree
[解决办法]
C# code//不大熟悉C#参考了一些楼上的代码，好不容易拼出来的程序。//计时不包括控制台的输出，否则把注释掉的打开就行//to 11楼我的结果和lcl_data的结果，在你发出来的部分是一样的using System;using System.Collections.Generic;using System.Linq;using System.Text;using System.IO;using System.Windows.Forms;namespace ConsoleApplication1{    class Program    {        public class Node        {            public Node[] cld;            public bool have;            public Node()            {                cld = new Node[26];                for (int i = 0; i < 26; ++i)                cld[i] = null;                have = false;            }            public Node forward(int x)            {                if (cld[x] != null) return cld[x];                return cld[x] = new Node();            }        }        public void dfs(Node now, string build)        {            if (now.have == true) System.Console.WriteLine(build);            for (int i = 0; i < 26; ++i) if (now.cld[i] != null)            {                dfs(now.cld[i], string.Copy(build) + (char)(i+'a'));            }        }        public void parse()        {            StreamReader objReader = new StreamReader("D:\\source.txt");            string s = objReader.ReadToEnd();            objReader.Close();            Node root = new Node();            Node curr = root;            for (int i = 0; i < s.Length; ++i)            {                int who = s[i];                if (who >= 'A' && who <= 'Z') who |= 32;                if (who < 'a' || who > 'z')                {                    if (curr != root)                    {                        curr.have = true;                        curr = root;                    }                }                else                {                    curr = curr.forward(who-'a');                }            }            //dfs(root, "");        }         static void Main(string[] args)        {            long l = DateTime.Now.Ticks;            (new Program()).parse();            TimeSpan ts = new TimeSpan(DateTime.Now.Ticks - l);            MessageBox.Show(ts.ToString());                     }    }}
[解决办法]
5006个应该是对的。
还有5000个字符相连用StringBuilder速度要快很多
[解决办法]
C# code//重新放在ArrayList里了using System;using System.Collections;using System.Collections.Generic;using System.Linq;using System.Text;using System.IO;using System.Windows.Forms;namespace ConsoleApplication1{    class Program    {        public class Node        {            public Node[] cld;            public bool have;            public Node()            {                cld = new Node[26];                for (int i = 0; i < 26; ++i)                cld[i] = null;                have = false;            }            public Node forward(int x)            {                if (cld[x] != null) return cld[x];                return cld[x] = new Node();            }        }        public void dfs(Node now, string build, ArrayList ret)        {            if (now.have == true) ret.Add(build);            for (int i = 0; i < 26; ++i) if (now.cld[i] != null)            {                dfs(now.cld[i], string.Copy(build) + (char)(i+'a'), ret);            }        }        public void parse()        {            StreamReader objReader = new StreamReader("D:\\source.txt");            string s = objReader.ReadToEnd();            objReader.Close();            Node root = new Node();            Node curr = root;            for (int i = 0; i < s.Length; ++i)            {                int who = s[i];                if (who >= 'A' && who <= 'Z') who |= 32;                if (who < 'a' || who > 'z')                {                    if (curr != root)                    {                        curr.have = true;                        curr = root;                    }                }                else                {                    curr = curr.forward(who-'a');                }            }            ArrayList ans = new ArrayList();            dfs(root, "", ans);            System.Console.Write("{0} elements have been found!\n", ans.Count);            //foreach(var x in ans) System.Console.WriteLine(x);        }         static void Main(string[] args)        {            long l = DateTime.Now.Ticks;            (new Program()).parse();            TimeSpan ts = new TimeSpan(DateTime.Now.Ticks - l);            MessageBox.Show(ts.ToString());                     }    }} 
 
[解决办法]
探讨
引用:
C# code

//重新放在ArrayList里了
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using Syste……

[解决办法]
楼主给的测试文件是utf8的 很多字符超过0x80
楼主给的结果很多单词是测试文件没有的

[解决办法]
研究下书本的单词 首字母服从什么分布？  不知道是不是正态分布。。

根据具体的分布情况自己重新写一个排序算法出来，系统自带的都是通用型的 要么快排等。。

根据分布 改写排序算法 这个是我的建议
[解决办法]
手上恰好没有很长的英文 电子版的书。。

热点排行