注册 登录  
 加关注
   显示下一条  |  关闭
温馨提示!由于新浪微博认证机制调整,您的新浪微博帐号绑定已过期,请重新绑定!立即重新绑定新浪微博》  |  关闭

gmd20的个人空间

// 编程和生活

 
 
 

日志

 
 

xml的escape string字符转换优化和GNU perfect hash function generator(gperf)  

2013-10-18 13:47:15|  分类: 程序设计 |  标签: |举报 |字号 订阅

  下载LOFTER 我的照片书  |
html和xml里面是有类似这样的escape string的编码转换的。
比如xml的
/**
* & --> &
* < --> &lt;
* > --> &gt;
* " --> &quot;
* ' --> &apos;
*/

这个转换,很容易成为程序的性能热点,现在的一个模块就是这样。
之前刚好看到类似问题相关的文章,escape string的转换相关函数优化之后,响应时间好很多。
github为了解决这个转换问题而建立的开源库houdini  https://github.com/vmg/houdini/

看了一下代码,还是encode的转换,主要是通过lookup table的查表方式来优化的。decode是使用gperf来生成 “完美哈希函数”来实现。 可以参考一下代码的实现,不错!

GNU gperf is a perfect hash function generator
这个可以为指定的预知字符串列表生成所谓的“完美哈希函数”,自动生成查找函数,不会有冲突键的存在。第一次了解。在某些场合下面使用应该是挺合适的。 
下面这个文章有点介绍

就是根据 https://github.com/vmg/houdini/blob/master/html_unescape.gperf 这样的配置,自动生成c函数代码了,和flex和bison那些有点类似用法。

我自己做一下小小的修改:



class small_buffer
{
public:
small_buffer():size_(0),extra_buf_((char*) &data_)
{

}
~small_buffer()
{
if(extra_buf_ != (char *) &data_) {
delete [] extra_buf_;
}
}
void buf_grow(unsigned int size)
{
if (size > 1024) {
extra_buf_ = new char[size];
}
}
void put(const void * b, unsigned int len)
{
memcpy(extra_buf_ + size_, b, len);
size_ += len;
}
void putc(char c)
{
extra_buf_[size_++] = c;
}
void puts(const char * string)
{
put(string, strlen(string));
}
char * data()
{
return extra_buf_;
}
unsigned int size()
{
return size_;
}
private:
char data_[1024];
unsigned int size_;
char *extra_buf_;
};



bool
unescape_xml(small_buffer &ob, const char *src, unsigned int size)
{
unsigned int i = 0;
unsigned int start = 0;
while (i < size) {
if (src[i] != '&') {
i++;
continue;
} else {
goto REAL_WORK;
}
}
return false;

REAL_WORK:
ob.buf_grow(size);

MATCH_LOOP:

char code = 0;
unsigned int len = 0;
do {
if (i + 3 < size) {
unsigned int key = *(unsigned int *) &src[i];
if (key == ('&' | 'l' << 8 | 't' << 16 | ';' << 24)) {
code = '<';
len = 4;
break;
}
if (key == ('&' | 'g' << 8 | 't' << 16 | ';' << 24)) {
code = '>';
len = 4;
break;
}
} else {
break;
}

if (i + 4 < size) {
unsigned int key = *(unsigned int *) &src[i+1];
if (key == ('a' | 'm' << 8 | 'p' << 16 | ';' << 24) ) {
code = '&';
len = 5;
break;
}
} else {
break;
}

if (i + 5 < size && src[i+5] == ';') {
unsigned int key = *(unsigned int *) &src[i+1];
if (key == ('q' | 'u' << 8 | 'o' << 16 | 't' << 24)) {
code = '"';
len = 6;
break;
}
if (key == ('a' | 'p' << 8 | 'o' << 16 | 's' << 24)) {
code = '\'';
len = 6;
break;
}
} else {
break;
}
} while (0);

if (code) {
if (i > start) {
ob.put(&src[start], i - start);
}
ob.putc(code);
i += len;
start = i;
} else {
i++;
}

while (i < size) {
if (src[i] != '&') {
i++;
continue;
} else {
goto MATCH_LOOP;
}
}

if (i > start) {
ob.put(&src[start], i - start);
}

return true;
}


/**
* & --> &amp;
* < --> &lt;
* > --> &gt;
* " --> &quot;
* ' --> &apos;
*/
static const char *LOOKUP_CODES[] = {
"", /* reserved: use literal single character */
"", /* unused */
"", /* reserved: 2 character UTF-8 */
"", /* reserved: 3 character UTF-8 */
"", /* reserved: 4 character UTF-8 */
"?", /* invalid UTF-8 character */
"&quot;",
"&amp;",
"&apos;",
"&lt;",
"&gt;"
};

static const unsigned char CODE_INVALID = 5;

static const unsigned char XML_LOOKUP_TABLE[] = {
/* ASCII: 0xxxxxxx */
5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 5, 5, 0, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
0, 0, 6, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0,10, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

/* Invalid UTF-8 char start: 10xxxxxx */
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,

/* Multibyte UTF-8 */

/* 2 bytes: 110xxxxx */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

/* 3 bytes: 1110xxxx */
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

/* 4 bytes: 11110xxx */
4, 4, 4, 4, 4, 4, 4, 4,

/* Invalid UTF-8: 11111xxx */
5, 5, 5, 5, 5, 5, 5, 5,
};



// ---------------------------------------
// if escape string was found in the sr buffer, ob will be set and return true
// else ob will not set and return false
//----------------------------------------
bool
escape_xml(small_buffer &ob, const unsigned char *src, unsigned int size)
{
unsigned int i = 0;
unsigned char code = 0;

// most of the time no replacement is needed, optimize for the fast path
while (i < size) {
unsigned int byte;
byte = src[i++];
code = XML_LOOKUP_TABLE[byte];
if (code) {
goto REAL_WORK;
}
}
return false;

REAL_WORK:
i--;
ob.buf_grow(size * 8);
if (i > 0) {
ob.put(src,i);
}

while (i < size) {
unsigned int start, end;
start = end = i;

while (i < size) {
unsigned int byte;

byte = src[i++];
code = XML_LOOKUP_TABLE[byte];

if (!code) {
/* single character used literally */
} else if (code >= CODE_INVALID) {
break; /* insert lookup code string */
} else if (code > size - end) {
code = CODE_INVALID; /* truncated UTF-8 character */
break;
} else {
unsigned int chr = byte & (0xff >> code);

while (--code) {
byte = src[i++];
if ((byte & 0xc0) != 0x80) {
code = CODE_INVALID;
break;
}
chr = (chr << 6) + (byte & 0x3f);
}

switch (i - end) {
case 2:
if (chr < 0x80)
code = CODE_INVALID;
break;
case 3:
if (chr < 0x800 ||
(chr > 0xd7ff && chr < 0xe000) ||
chr > 0xfffd)
code = CODE_INVALID;
break;
case 4:
if (chr < 0x10000 || chr > 0x10ffff)
code = CODE_INVALID;
break;
default:
break;
}
if (code == CODE_INVALID)
break;
}

end = i;
}

if (end > start)
ob.put(src + start, end - start);

/* escaping */
if (end >= size)
break;

ob.puts(LOOKUP_CODES[code]);
}

return true;
}


使用例子

int main (int, char**)
{
int a = 1;
restart:
small_buffer buf;
small_buffer buf2;
std::string s;
std::cin >> s;

if (escape_xml(buf, (unsigned char *)s.c_str(),s.size())) {
s.assign(buf.data(),buf.size());
}
std::cout << s << std::endl;

if (unescape_xml(buf2,s.c_str(),s.size())) {
s.assign(buf2.data(),buf2.size());
}
std::cout << s << std::endl;


goto restart;
return 0;

}

  评论这张
 
阅读(603)| 评论(0)
推荐 转载

历史上的今天

评论

<#--最新日志,群博日志--> <#--推荐日志--> <#--引用记录--> <#--博主推荐--> <#--随机阅读--> <#--首页推荐--> <#--历史上的今天--> <#--被推荐日志--> <#--上一篇,下一篇--> <#-- 热度 --> <#-- 网易新闻广告 --> <#--右边模块结构--> <#--评论模块结构--> <#--引用模块结构--> <#--博主发起的投票-->
 
 
 
 
 
 
 
 
 
 
 
 
 
 

页脚

网易公司版权所有 ©1997-2017