c++ html parse解决方法

2012-03-26

c++ html parse：有开源的 C/C++ 的 HTML 解析器么？简单、能处理不规范的 HTML 即可。[解决办法] 这个可以用正则表达式来做……

c++ html parse
有开源的c/c++的html解析器么？简单、能处理不规范的html即可。

[解决办法]
这个可以用正则表达式来做。下面是 Boost 库文档里的一个正则表达式示例，当然你也可以使用微软的正则库。
The following example takes C/C++ source code as input, and outputs syntax highlighted HTML code.

#include <exception>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>

#include <boost/regex.hpp>

// purpose:
// takes the contents of a file and transform to
// syntax highlighted code in html format

// Regular expressions shared by the program. main() assigns expression_text
// to e1 and pre_expression to e2 before any file is processed.
boost::regex e1, e2;
// Pattern text that main() assigns to e1 (second, highlighting pass).
extern const char* expression_text;
// Format string used with e1 in the second regex_replace call in main().
extern const char* format_string;
// Pattern text that main() assigns to e2 (first pass over the raw input).
extern const char* pre_expression;
// Format string used with e2 in the first regex_replace call in main().
extern const char* pre_format;
// Text written to each output file before the converted content.
extern const char* header_text;
// Text written to each output file after the converted content.
extern const char* footer_text;

// Reads every remaining character of `is` into `s`.
//
// s  - destination string; its previous contents are discarded.
// is - input stream; characters are extracted with get() until extraction
//      fails (end of input or a stream error).
void load_file(std::string& s, std::istream& is)
{
    s.erase();

    // in_avail() is only a hint and may legitimately be 0 or -1. Passing a
    // negative value to reserve() would wrap around to an enormous size and
    // throw std::length_error, so only pre-allocate for a positive hint.
    if (std::streambuf* buf = is.rdbuf())
    {
        const std::streamsize hint = buf->in_avail();
        if (hint > 0)
            s.reserve(static_cast<std::string::size_type>(hint));
    }

    char c;
    while (is.get(c))
        s.push_back(c); // std::string already grows geometrically
}

// Converts each file named on the command line into an HTML file.
//
// For every argv[i] (i >= 1) the file is loaded into memory, run through a
// first regex_replace pass using e2/pre_format, then through a second pass
// using e1/format_string, and the result is written to "<argv[i]>.htm"
// between header_text and footer_text.
//
// Returns 0 when every file was processed, -1 when a file could not be
// opened or an exception was thrown.
int main(int argc, const char** argv)
{
    int status = 0;
    try
    {
        e1.assign(expression_text);
        e2.assign(pre_expression);
        for (int i = 1; i < argc; ++i)
        {
            std::cout << "Processing file " << argv[i] << std::endl;

            std::ifstream fs(argv[i]);
            if (!fs)
            {
                // Report and skip rather than emitting an empty HTML file.
                std::cerr << "Cannot open input file " << argv[i] << std::endl;
                status = -1;
                continue;
            }
            std::string in;
            load_file(in, fs);

            std::string out_name(std::string(argv[i]) + std::string(".htm"));
            std::ofstream os(out_name.c_str());
            if (!os)
            {
                std::cerr << "Cannot open output file " << out_name << std::endl;
                status = -1;
                continue;
            }
            os << header_text;

            // First pass: apply e2/pre_format to the raw input (handles the
            // '<' and '>' characters), writing to a temporary string stream.
            std::ostringstream t(std::ios::out | std::ios::binary);
            std::ostream_iterator<char, char> oi(t);
            boost::regex_replace(oi, in.begin(), in.end(),
                                 e2, pre_format,
                                 boost::match_default | boost::format_all);

            // Second pass: apply e1/format_string to add the syntax
            // highlighting, writing straight to the output file.
            std::string s(t.str());
            std::ostream_iterator<char, char> out(os);
            boost::regex_replace(out, s.begin(), s.end(),
                                 e1, format_string,
                                 boost::match_default | boost::format_all);

            os << footer_text;
        }
    }
    catch (const std::exception& ex)
    {
        // Surface the reason (e.g. a malformed regular expression) instead
        // of failing silently.
        std::cerr << "Error: " << ex.what() << std::endl;
        return -1;
    }
    catch (...)
    {
        std::cerr << "Error: unknown exception" << std::endl;
        return -1;
    }
    return status;
}

// Pattern for the HTML-escaping pre-pass: group 1 matches '<', group 2
// matches '>', group 3 matches '&'; a carriage return matches without
// setting any group.
const char* pre_expression = "(<)|(>)|(&)|\\r";
// Conditional format string paired with pre_expression: emits the HTML
// entity for whichever group matched; a match with no group (the carriage
// return) is replaced by nothing.
const char* pre_format = "(?1&lt;)(?2&gt;)(?3&amp;)";

const char* expression_text = // preprocessor directives: index 1
"(^[[:blank:]]*#(?:[^\\\\\\n]|\\\\[^\\n[:punct:][:word:]]*[\\n[:punct:][:word:]])*)| "
// comment: index 2
"(//[^\\n]*|/\\*.*?\\*/)| "
// literals: index 3
"\\ <([+-]?(?:(?:0x[[:xdigit:]]+)|(?:(?:[[:digit:]]*\\.)?[[:digit:]]+(?:[eE][+-]?[[:digit:]]+)?))u?(?:(?:int(?:8|16|32|64))|L)?)\\> | "
// string literals: index 4
"( '(?:[^\\\\ ']|\\\\.)* '|\ "(?:[^\\\\\ "]|\\\\.)*\ ")| "
// keywords: index 5
"\\ <(__asm|__cdecl|__declspec|__export|__far16|__fastcall|__fortran|__import "
"|__pascal|__rtti|__stdcall|_asm|_cdecl|__except|_export|_far16|_fastcall "
"|__finally|_fortran|_import|_pascal|_stdcall|__thread|__try|asm|auto|bool "

热点排行