算法擂台——根据文本产生字典
注意,测试数据已经上传,位于:http://download.csdn.net/source/3257888
上一个问题是不是有点难,先来一个简单的问题热身下。
考虑这样一个问题,给定一段输入文本,从中提取所有的单词,并且输出,要求输出的单词经过排序,不区分大小写,而且不重复和遗漏。
比如:"Hello World! Hello everyone, my name is caozhy!"
输出:
caozhy
everyone
hello
is
my
name
world
下面给一个我的实现:
class DictGen { class Node { public Dictionary<char, Node> Dict { get; set; } public Node() { Dict = new Dictionary<char, Node>(); } public IEnumerable<string> Get() { foreach (var item in Dict.OrderBy(x => x.Key)) { if (item.Key == '\0') yield return ""; foreach (var item1 in item.Value.Get()) { yield return item.Key.ToString().ToLower() + item1; } } } } public static IEnumerable<string> ParseString(TextReader tr) { Node rootNode = new Node(); Node currNode = rootNode; int state = 0; char[] arr; try { while (true) { arr = tr.ReadLine().ToUpper().ToCharArray(); for (int i = 0; i < arr.GetLength(0); i++) { if (arr[i] >= 65 && arr[i] <= 90) { state = 1; if (!currNode.Dict.ContainsKey(arr[i])) currNode.Dict.Add(arr[i], new Node()); currNode = currNode.Dict[arr[i]]; } else { if (state == 1) { if (!currNode.Dict.ContainsKey('\0')) currNode.Dict.Add('\0', new Node()); state = 0; } currNode = rootNode; } } } } catch { } return rootNode.Get(); } }
using System;using System.Collections.Generic;using System.Linq;using System.Text;using System.Threading;using System.Text.RegularExpressions;namespace ConsoleApplication1{ class Program { static void Main(string[] args) { foreach (var item in GetStr("Hello World! Hello everyone, my name is caozhy!")) { Console.WriteLine(item); } } static IEnumerable<string> GetStr(string str) { string[] list = str.Split(new char[] { ' ', ',', '!' }, StringSplitOptions.RemoveEmptyEntries); return list.Distinct().OrderBy(i => i.ToLowerInvariant()).AsEnumerable(); } }}
[解决办法]
这种问题我一般用c++使用STL的map,直接put,然后使用iterator输出。
O(nlogn)
[解决办法]
using System;using System.Collections.Generic;using System.Linq;using System.Text;using System.Threading;using System.Text.RegularExpressions;namespace ConsoleApplication1{ class Program { static void Main(string[] args) { foreach (var item in GetStr("Hello World! Hello everyone, my name is caozhy!,ni ,hao ,....")) { Console.WriteLine(item); } } static IEnumerable<string> GetStr(string str) { Regex re = new Regex(@"[a-zA-Z]+"); List<string> list = new List<string>(); MatchCollection mc = re.Matches(str); foreach (Match item in mc) { list.Add(item.Value); } return list.Distinct().OrderBy(i => i.ToLowerInvariant()).AsEnumerable(); } }}
[解决办法]
I'm
aren't
这种算一个词还是两个词?
[解决办法]
手上恰好没有很长的英文 电子版的书。。
试了几个短的,没发现什么问题
[解决办法]
#include <iostream>#include <map>#include <string>using namespace std;int main(){ map<string,bool> result; string s = "Hello World! Hello everyone, my name is caozhy!"; int p=0; string word; while(p<s.size()){ char now = tolower(s[p]); if(now>='a' && now<='z') word += now; else{ result[word]=true; word=""; } p++; } for(map<string,bool>::iterator iter=result.begin();iter != result.end();iter++) { cout<<iter->first<<endl; }}
[解决办法]
不好意思。。没注意是C#专区……
[解决办法]
static IEnumerable<string> GetStr(string str) { Regex re = new Regex(@"[a-zA-Z]+"); List<string> list = new List<string>(); MatchCollection mc = re.Matches(str); foreach (Match item in mc) { list.Add(item.Value.ToLowerInvariant()); } return list.Distinct().OrderBy(i =>i).AsEnumerable(); }
[解决办法]
二叉排序?
[解决办法]
是比你的少些,我看看源文件
[解决办法]
不多的话 贴这里就好
[解决办法]
static IOrderedEnumerable<string> GetWords(TextReader tr) { HashSet<string> hs = new HashSet<string>(); Queue<char> q = new Queue<char>(); int i = tr.Read(); while (i != -1) { if (i >= 65 && i <= 90) { i += 32; q.Enqueue((char)i); } else if (i >= 97 && i <= 122) { q.Enqueue((char)i); } else { if (q.Count > 0) { hs.Add(new string(q.ToArray())); q.Clear(); } } i = tr.Read(); } tr.Close(); return hs.OrderBy(s => s); }
[解决办法]
换SortedList试试:
static void Main(string[] args) { TextReader tr = new StringReader("Hello World! Hello everyone, my name is caozhy!"); var hs = GetWords(tr); foreach (string s in hs.Keys) Console.WriteLine(s); } static SortedList<string, bool> GetWords(TextReader tr) { SortedList<string, bool> sl = new SortedList<string, bool>(); Queue<char> q = new Queue<char>(); int i = tr.Read(); while (i != -1) { if (i >= 65 && i <= 90) { i += 32; q.Enqueue((char)i); } else if (i >= 97 && i <= 122) { q.Enqueue((char)i); } else { if (q.Count > 0) { string str = new string(q.ToArray()); if (!sl.ContainsKey(str)) sl.Add(str, true); q.Clear(); } } i = tr.Read(); } tr.Close(); return sl; }
[解决办法]
数据传上去了吗?
我看我的为什么少了
[解决办法]
class Program { static void Main(string[] args) { string str = File.ReadAllText(@"C:\123.txt"); DateTime dt = DateTime.Now; char[] buffer = new char[64];//假设没有超过64个字符的单词 int index = 0; char c; BST bst = new BST(); for (int i = 0; i < str.Length; i++) { c = (char)(str[i] | 0x20); if (c >= 97 && c <= 122) { buffer[index++] = c; } else if (index > 0) { bst.Insert(new string(buffer,0,index)); index = 0; } } bst.Root.Output(); } } public class BSTNode { public string Value { get; set; } public BSTNode Left { get; set; } public BSTNode Right { get; set; } public BSTNode(string value) { this.Value = value; } public void InsertNode(BSTNode node) { if (string.Compare(this.Value, node.Value) < 0) { if (this.Right == null) this.Right = node; else this.Right.InsertNode(node); } else if (string.Compare(this.Value, node.Value) > 0) { if (this.Left == null) this.Left = node; else this.Left.InsertNode(node); } } public void Output() { if (this.Left != null) this.Left.Output(); Console.WriteLine(this.Value); if (this.Right != null) this.Right.Output(); } } public class BST { public BSTNode Root { get; set; } public void Insert(string value) { BSTNode node = new BSTNode(value); if (Root == null) Root = node; else Root.InsertNode(node); } }
[解决办法]
直接用trie行不
用初始状态表示空串或者一个非法串。
在初始状态下读入一个非字母,则还是初始状态。
读一个字母则按trie的规则进入下一个状态。
在非初始状态下读入一个非字母,则标记这个状态为有单词,然后进入初始状态。
也就是说在trie的基础上稍微变形。
最后维护一个当前单词,dfs这个trie,dfs的时候注意一下先搞ascii靠前的,遇到了一个有单词的状态就可以输出了。
这样,时间复杂度就是输入串的长度,总共只扫描一次。
空间复杂度相对有点大。
[解决办法]
static IOrderedEnumerable<string> GetWords(TextReader tr) { HashSet<string> hs = new HashSet<string>(); char[] cs = new char[45]; int i = tr.Read(); int index = 0; int length = 0; while (i != -1) { if (i >= 65 && i <= 90) { cs[index++] = (char)(i + 32); length++; } else if (i >= 97 && i <= 122) { cs[index++] = (char)i; length++; } else { if (length > 0) { hs.Add(new string(cs, 0, length)); index = 0; length = 0; } } i = tr.Read(); } tr.Close(); return hs.OrderBy(s => s); }
[解决办法]
原来是那么简单的问题,我还以为考虑中文,词组,原来仅仅是考虑英文单词!一个循环,再来个排序就搞定了!
[解决办法]
感觉先把单词分开,然后存入STL的map或者set里面,然后用迭代器输出应该可以搞定,不过这样效率都取决于容器的效率的,就背离LZ的意思了
[解决办法]
凑个热闹using System;using System.Collections.Generic;using System.Linq;using System.Text;using System.Text.RegularExpressions;namespace Arithmetic{ class Program { static List<string> ListWords = new List<string>(); static void Main(string[] args) { string str = "Hello World Hello everyone my name is caozhy"; GetWords(str); ListWords.ForEach(delegate(string word) { Console.WriteLine(word); }); } public static void GetWords(string text) { string strWords = text.ToLower(); Regex regexWord = new Regex("[a-z]+"); MatchCollection mathCollection= regexWord.Matches(strWords); foreach (Match m in mathCollection) { ListWords.Add(m.Value); } } }}
[解决办法]
算法问题…………自愧不如
[解决办法]
MatchCollection matches = Regex.Matches(s, @"[a-zA-Z]+"); List<string> words = new List<string>(); for (int i = 0; i < matches.Count; i++) words.Add(matches[i].Value); words.Sort(); for (int i = 1; i < words.Count; i++) { if (words[i] == words[i - 1]) { words.RemoveAt(i--); continue; } if (char.IsUpper(words[i][0]) && words[i].ToLower() == words[i - 1]) words.RemoveAt(i--); }
[解决办法]
发现楼主得到的结果,有很多源文本里没有,比如
aboutmore
aboutwhat
……
类似的还有很多。
[解决办法]
public static IEnumerable<string> ParseString1(TextReader tr){ return ParseString2(tr).Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries).Distinct().OrderBy(i => i);}public static string ParseString2(TextReader tr){ var s = tr.ReadToEnd().ToCharArray(); for (var x = 0; x < s.Length; x++) if (s[x] >= 65 && s[x] <= 90) s[x] += (char)32; else if (s[x] < 97 || s[x] > 122) s[x] = ' '; return new string(s);}
[解决办法]
版主推荐区进来的
不会用c#
我用python写一个行不行
import string
def stat(strin):
maketrans = strin.maketrans(string.punctuation, ' '*len(string.punctuation))
strin = strin.translate(maketrans)
strin = strin.strip()
word = strin.strip()
word.sort()
for i in word:
print(i)
[解决办法]
版主推荐区进来的
不会用c#
我用python写一个行不行
import string
def stat(strin):
maketrans = strin.maketrans(string.punctuation, ' '*len(string.punctuation))
strin = strin.translate(maketrans)
strin = strin.strip()
word = strin.strip()
word.sort()
for i in word:
print(i)
[解决办法]
排序,输出,
输出的时候检查当前词汇.如果是老单词就不输出.
[解决办法]
trie tree
[解决办法]
//不大熟悉C#参考了一些楼上的代码,好不容易拼出来的程序。//计时不包括控制台的输出,否则把注释掉的打开就行//to 11楼我的结果和lcl_data的结果,在你发出来的部分是一样的using System;using System.Collections.Generic;using System.Linq;using System.Text;using System.IO;using System.Windows.Forms;namespace ConsoleApplication1{ class Program { public class Node { public Node[] cld; public bool have; public Node() { cld = new Node[26]; for (int i = 0; i < 26; ++i) cld[i] = null; have = false; } public Node forward(int x) { if (cld[x] != null) return cld[x]; return cld[x] = new Node(); } } public void dfs(Node now, string build) { if (now.have == true) System.Console.WriteLine(build); for (int i = 0; i < 26; ++i) if (now.cld[i] != null) { dfs(now.cld[i], string.Copy(build) + (char)(i+'a')); } } public void parse() { StreamReader objReader = new StreamReader("D:\\source.txt"); string s = objReader.ReadToEnd(); objReader.Close(); Node root = new Node(); Node curr = root; for (int i = 0; i < s.Length; ++i) { int who = s[i]; if (who >= 'A' && who <= 'Z') who |= 32; if (who < 'a' || who > 'z') { if (curr != root) { curr.have = true; curr = root; } } else { curr = curr.forward(who-'a'); } } //dfs(root, ""); } static void Main(string[] args) { long l = DateTime.Now.Ticks; (new Program()).parse(); TimeSpan ts = new TimeSpan(DateTime.Now.Ticks - l); MessageBox.Show(ts.ToString()); } }}
[解决办法]
5006个应该是对的。
还有5000个字符相连用StringBuilder速度要快很多
[解决办法]
//重新放在ArrayList里了using System;using System.Collections;using System.Collections.Generic;using System.Linq;using System.Text;using System.IO;using System.Windows.Forms;namespace ConsoleApplication1{ class Program { public class Node { public Node[] cld; public bool have; public Node() { cld = new Node[26]; for (int i = 0; i < 26; ++i) cld[i] = null; have = false; } public Node forward(int x) { if (cld[x] != null) return cld[x]; return cld[x] = new Node(); } } public void dfs(Node now, string build, ArrayList ret) { if (now.have == true) ret.Add(build); for (int i = 0; i < 26; ++i) if (now.cld[i] != null) { dfs(now.cld[i], string.Copy(build) + (char)(i+'a'), ret); } } public void parse() { StreamReader objReader = new StreamReader("D:\\source.txt"); string s = objReader.ReadToEnd(); objReader.Close(); Node root = new Node(); Node curr = root; for (int i = 0; i < s.Length; ++i) { int who = s[i]; if (who >= 'A' && who <= 'Z') who |= 32; if (who < 'a' || who > 'z') { if (curr != root) { curr.have = true; curr = root; } } else { curr = curr.forward(who-'a'); } } ArrayList ans = new ArrayList(); dfs(root, "", ans); System.Console.Write("{0} elements have been found!\n", ans.Count); //foreach(var x in ans) System.Console.WriteLine(x); } static void Main(string[] args) { long l = DateTime.Now.Ticks; (new Program()).parse(); TimeSpan ts = new TimeSpan(DateTime.Now.Ticks - l); MessageBox.Show(ts.ToString()); } }}
[解决办法]