非标准的xml解析器的C++实现:三、解析器的初步实现


如同我之前的一篇文章说的那样,当前的实现没有支持DTD与命名空间。

当前实现出来的解析器,只能与xmlhttp对比,因为chrome浏览器解析大文档有bug,至于其他人实现的,我就不一一测试了,既然都决定自己实现了,我只选择大公司的代码做对比。

测试文档大小:3M bytes,约90000个节点。

aqx::xdoc :耗时 70-80ms,内存占用30-40M bytes,30和40主要是32位和64位的区别,如果要追求最少的内存占用,还可以更极端一些,解析速度很难再有本质的提升了,后续要完善的支持,也不会影响解析速度。

xmlhttp: 耗时3000-4000 ms,内存占用约 800M bytes。

目前测试过的系统有:

windows vc++ (vs2017) x86 x64

linux centos7 g++ version(9.1.1) x86 x64

Windows中支持3种编码格式:utf-8、utf-16,以及操作系统对应的ANSI(本地代码页)编码;在简体中文Windows中,也就是通常我们说的gb2312了。

然后目前,我不太可能针对实现细节将原理讲清楚,讲真的,C++的可读性真的非常糟糕,但对于这种需求,还是得用它,这种代码,我自己写完看着不难受,但对别人来说很可能是噩梦,同样的道理,我看别人的C++代码,也会困惑,要让看不懂代码的人,也理解实现细节,这是非常不科学的事。。。

好了,废话不说了,上代码:

//xml.hpp
#pragma once

#include <setjmp.h>
#include <stdio.h>
#include <string.h>
#include <wchar.h>
#include <fstream>
#include <list>
#include <map>
#include <set>
#include <string>
#include <vector>
#if defined(_WIN32) || defined(_WIN64)
//我只支持了windows中的编码转换,所以这两个文件,仅在windows下使用。
#include "tcvt.h"
#include "encode_adaptive.h"
#endif

#pragma warning(push)
#pragma warning(disable:4996)

namespace aqx {

    namespace aqx_internal {

#ifndef __AQX_UTF8_CHAR_LEN
#define __AQX_UTF8_CHAR_LEN
        // Byte length of a UTF-8 sequence, indexed by the value of its first byte.
        // Bytes that cannot start a sequence (0x80-0xBF, 0xFE, 0xFF) map to 1 so an
        // iterator stepping by this length always advances.
        static unsigned char utf8_char_len[256] = {
            // 0x00-0x7F: single-byte (ASCII)
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            // 0x80-0xBF: continuation bytes
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            // 0xC0-0xDF: two-byte lead
            2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
            2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
            // 0xE0-0xEF: three-byte lead
            3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
            // 0xF0-0xF7: 4, 0xF8-0xFB: 5, 0xFC-0xFD: 6, 0xFE-0xFF: 1
            4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
        };
#endif

        // Syntax-class bit mask for each single-byte character (bits are the
        // XML_SYNTAX constants).
        //
        // The table is sized to 256 entries so that ANY byte value is a valid index.
        // Only the first 128 entries are spelled out; the rest are zero-initialised,
        // meaning "no syntax class". The text iterators index this table with
        // (unsigned char)c whenever the character length is 1, and for UTF-8 that
        // includes stray continuation bytes (0x80-0xBF) and 0xFE/0xFF in malformed
        // input, which previously read past the end of a 128-entry array.
        static unsigned short xml_char_syntax[256] = {
            0,0,0,0,0,0,0,0,0,8,8,0,0,8,0,0,
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
            8,64,4,0,0,0,0,4,0,0,0,0,0,8208,16,128,
            1552,1552,1552,1552,1552,1552,1552,1552,1552,1552,0,256,1,2048,2,0,
            0,1072,1072,1072,1072,1072,1072,48,48,48,48,48,48,48,48,48,
            48,48,48,48,48,48,48,48,48,48,48,4096,0,0,0,48,
            0,1072,1072,1072,1072,1072,1072,48,48,48,48,48,48,48,48,48,
            48,48,48,48,48,48,48,48,48,48,48,0,0,0,0,0,
            // 0x80-0xFF: implicitly zero
        };


        namespace XML_SYNTAX {
            // Syntax-class bit flags. Each character is classified by OR-ing these
            // together (see xml_char_syntax); the parser tests them with bitwise AND.
            // The type is unsigned short to match the iterator's `s` / `flags` members.

            static constexpr unsigned short _X_LT = 0x01;           // <
            static constexpr unsigned short _X_GT = 0x02;           // >
            static constexpr unsigned short _X_STRING = 0x04;       // ' "
            static constexpr unsigned short _X_SPACE = 0x08;        // \t \r \n and space
            static constexpr unsigned short _X_NAME = 0x10;         // A-Z a-z 0-9 _ - .
            static constexpr unsigned short _X_BEGINNAME = 0x20;    // A-Z a-z _
            static constexpr unsigned short _X_EXCLAM = 0x40;       // !
            static constexpr unsigned short _X_TAGEND = 0x80;       // /
            static constexpr unsigned short _X_ESCAPEEND = 0x100;   // ;
            static constexpr unsigned short _X_NUMBER = 0x200;      // decimal digit 0-9
            static constexpr unsigned short _X_HEX = 0x400;         // hex digit 0-9 A-F a-f
            static constexpr unsigned short _X_EQUAL = 0x800;       // =
            static constexpr unsigned short _X_LB = 0x1000;         // [
            static constexpr unsigned short _X_NEGATIVE = 0x2000;   // -
            static constexpr unsigned short _X_MULTIBYTE = 0x4000;  // multi-byte character
        }

        // Offset/length type for positions inside an XML document. It is an alias so
        // the maximum document size can be widened in one place later.
        using xml_size_t = unsigned int;
        // "Not found" / "not set" sentinel: the largest xml_size_t value.
        static constexpr xml_size_t _xnf = static_cast<xml_size_t>(-1);

        // Position and length of an escape sequence, kept so escapes could later be
        // replaced quickly. Reserved for future use; nothing fills it in yet.
        struct xml_escape_pos {
            xml_size_t pos;
            xml_size_t len;
        };

        // Forward declaration so the text-iterator classes can befriend the parser.
        // _XtsTy is the text-iterator type the parser is instantiated with.
        template<typename _XtsTy>
        class xparser_t;

        // Base class template for the XML text iterators.
        // _Ty is the character type of the text being scanned.
        template<typename _Ty>
        class xts_t {
        public:
            using Basetype = _Ty;

        protected:
            const _Ty *text = nullptr;   // start of the text being scanned
            xml_size_t size = 0;         // length passed to init()
            xml_size_t index = 0;        // offset of the current character
            _Ty c = 0;                   // current character (first unit of it)
            unsigned char cl = 1;        // length of the current character, in units of _Ty
            unsigned short s = 0;        // syntax-class bits of the current character
            unsigned short flags = 0;    // syntax-class bits currently accepted
        };

        //解析错误信息结构
        //解析时不处理行,列问题,有错误发生时后处理,因为,行,列处理,会使解析速度慢差不多一倍。
        struct xerrorpos {
            xml_size_t pos;
            int number;
            std::string information;
            xml_size_t line;
            xml_size_t column;
        };

        //这两个结构用来储存一些字符串常量,实现两种字符串格式的快速引用,这两个结构绑定到三种xts类中
        struct xmultybyte_constvalue {
            static constexpr const char *emp = "";
            static constexpr const char *br_tag = "
"; static constexpr const char *crlf = "\r\n"; static constexpr const char *end_tag_syntax = ""; static constexpr const char *autoend_tag_syntax = "/>"; static constexpr const char *comment_end = "--"; static constexpr const char *cdata_end = "]]>"; }; struct xwidechar_constvalue { static constexpr const wchar_t *emp = L""; static constexpr const wchar_t *br_tag = L"
"; static constexpr const wchar_t *crlf = L"\r\n"; static constexpr const wchar_t *end_tag_syntax = L""; static constexpr const wchar_t *autoend_tag_syntax = L"/>"; static constexpr const wchar_t *comment_end = L"--"; static constexpr const wchar_t *cdata_end = L"]]>"; }; //utf8的文本迭代器,先基于这个来实现 class xts_utf8 : public xts_t<char> { public: using strtype = std::string; static constexpr int _encoding{ 2 }; using constval = xmultybyte_constvalue; //初始化 void init(const char *_Text, xml_size_t _Size) { text = _Text; size = _Size; index = 0; c = text[0]; cl = utf8_char_len[(unsigned char)c]; s = (cl != 1) ? XML_SYNTAX::_X_MULTIBYTE | XML_SYNTAX::_X_BEGINNAME | XML_SYNTAX::_X_NAME : xml_char_syntax[(unsigned char)c]; } //处理下一个字符 void next() { index += cl; c = text[index]; cl = utf8_char_len[(unsigned char)c]; s = (cl != 1) ? XML_SYNTAX::_X_MULTIBYTE | XML_SYNTAX::_X_BEGINNAME | XML_SYNTAX::_X_NAME : xml_char_syntax[(unsigned char)c]; } //向前回退n个字符,目前,只有在根节点之前的处理,有用到这个 void back(xml_size_t len) { index -= len; c = text[index]; cl = utf8_char_len[(unsigned char)c]; s = (cl != 1) ? XML_SYNTAX::_X_MULTIBYTE | XML_SYNTAX::_X_BEGINNAME | XML_SYNTAX::_X_NAME : xml_char_syntax[(unsigned char)c]; } //next,并判断语法 bool next_is_flags() { next(); return (flags & s) != 0; } //next, 并判断下一个字符的值 bool next_is_char(char _Chr) { next(); return _Chr == c; } //解析错误时,用于获取行,列。 void next_donot_syntax() { index += cl; c = text[index]; cl = utf8_char_len[(unsigned char)c]; } //设置允许的语法 void set_flags(unsigned short _Flags) { flags = _Flags; } private: friend class xparser_t; }; //asc的文本迭代器 class xts_asc : public xts_t<char> { public: using strtype = std::string; static constexpr int _encoding{ 0 }; using constval = xmultybyte_constvalue; //初始化 void init(const char *_Text, xml_size_t _Size) { text = _Text; size = _Size; index = 0; c = text[0]; cl = ((unsigned short)c >= 0x80) ? 2 : 1; s = (cl != 1) ? 
XML_SYNTAX::_X_MULTIBYTE | XML_SYNTAX::_X_BEGINNAME | XML_SYNTAX::_X_NAME : xml_char_syntax[(unsigned char)c]; } //处理下一个字符 void next() { index += cl; c = text[index]; cl = ((unsigned short)c >= 0x80) ? 2 : 1; s = (cl != 1) ? XML_SYNTAX::_X_MULTIBYTE | XML_SYNTAX::_X_BEGINNAME | XML_SYNTAX::_X_NAME : xml_char_syntax[(unsigned char)c]; } //向前回退n个字符,目前,只有在根节点之前的处理,有用到这个 void back(xml_size_t len) { index -= len; c = text[index]; cl = ((unsigned short)c >= 0x80) ? 2 : 1; s = (cl != 1) ? XML_SYNTAX::_X_MULTIBYTE | XML_SYNTAX::_X_BEGINNAME | XML_SYNTAX::_X_NAME : xml_char_syntax[(unsigned char)c]; } //next,并判断语法 bool next_is_flags() { next(); return (flags & s) != 0; } //next, 并判断下一个字符的值 bool next_is_char(char _Chr) { next(); return _Chr == c; } void next_donot_syntax() { index += cl; c = text[index]; cl = ((unsigned short)c >= 0x80) ? 2 : 1; } //设置允许的语法 void set_flags(unsigned short _Flags) { flags = _Flags; } private: friend class xparser_t; }; class xts_utf16 : public xts_t { public: using strtype = std::wstring; static constexpr int _encoding{ 1 }; using constval = xwidechar_constvalue; xts_utf16() { //utf16的字符不是变长的,固定为1 //虽然有4字节的utf16字符,但影响不到最终解析逻辑。 cl = 1; } private: void init(const wchar_t *_Text, xml_size_t _Size) { text = _Text; size = _Size; index = 0; c = text[0]; cl = 1; s = ((unsigned short)c >= 0x80) ? XML_SYNTAX::_X_MULTIBYTE | XML_SYNTAX::_X_BEGINNAME | XML_SYNTAX::_X_NAME : xml_char_syntax[(unsigned char)c]; } //处理下一个字符 void next() { c = text[++index]; s = ((unsigned short)c >= 0x80) ? XML_SYNTAX::_X_MULTIBYTE | XML_SYNTAX::_X_BEGINNAME | XML_SYNTAX::_X_NAME : xml_char_syntax[(unsigned char)c]; } void back(xml_size_t len) { index -= len; c = text[index]; s = ((unsigned short)c >= 0x80) ? 
XML_SYNTAX::_X_MULTIBYTE | XML_SYNTAX::_X_BEGINNAME | XML_SYNTAX::_X_NAME : xml_char_syntax[(unsigned char)c]; } //next,并判断语法 bool next_is_flags() { next(); return (flags & s) != 0; } //next, 并判断下一个字符的值 bool next_is_char(char _Chr) { next(); return _Chr == c; } void next_donot_syntax() { c = text[++index]; } //设置允许的语法 void set_flags(unsigned short _Flags) { flags = _Flags; } private: friend class xparser_t; }; template class xdocument_t; template class xelement_t; template class xresource_t { public: //xml节点数据结构,因为结构相互依赖的原因,所以嵌套在一起 using Basetype = typename _Ty::value_type; class xnode; using xtagindex_t = std::list; using xtagindex_ref = typename xtagindex_t::iterator; using xdoctext_t = std::list<_Ty>; using xattrname_t = std::set<_Ty>; using xattrvalue_t = std::map<_Ty, xml_size_t>; using xtagtext_t = std::map<_Ty, xtagindex_t>; using xdoctext_ref = typename xdoctext_t::iterator; using xtagtext_ref = typename xtagtext_t::iterator; using xattrname_ref = typename xattrname_t::iterator; using xattrvalue_ref = typename xattrvalue_t::iterator; class xnode { public: using _Self_Reftype = typename std::list::iterator; xnode() { parent = nullptr; } xnode(xnode *_Parent, xresource_t *_Resource) { parent = _Parent; ti.doc_body_ref = inner.end = inner.begin = _Resource->docs.end(); } private: void refactor_tag_body(int _Style, xml_size_t _PreSize, xresource_t *_Resource) { _Ty &_Tmp = _Resource->refactor_buffer; _Tmp.clear(); _Tmp.reserve(_PreSize); _Tmp += (Basetype)'<'; _Tmp += ti.name->first; for (auto it = attrs.begin(); it != attrs.end(); ++it) { _Tmp += (Basetype)' '; _Tmp += *it->name; _Tmp += (Basetype)'='; _Tmp += (Basetype)it->st; _Tmp += it->value->first; _Tmp += (Basetype)it->st; } if (_Style == 2) _Tmp += (Basetype)'/'; _Tmp += (Basetype)'>'; if (ti.doc_body_ref == _Resource->docs.end()) { _Resource->docs.push_back(_Tmp); ti.doc_body_ref = --(_Resource->docs.end()); parent->inner.end = ti.doc_body_ref; if (parent->inner.begin == _Resource->docs.end()) 
parent->inner.begin = ti.doc_body_ref; } else { *ti.doc_body_ref = _Tmp; } } private: friend class xresource_t; friend class xparser_t; friend class xparser_t; friend class xparser_t; friend class xdocument_t; friend class xdocument_t; friend class xdocument_t; friend class xelement_t; friend class xelement_t; friend class xelement_t; struct xattr { xattrname_ref name; xattrvalue_ref value; char st; }; struct tag_info { xtagtext_ref name;//标签名称 xtagindex_ref name_index_ref;//在标签名称索引中的引用,本质上其实就是个指针 xdoctext_ref doc_body_ref;//整个标签信息,包含属性 在文档中的实体 }ti; std::list attrs; struct xinner { xdoctext_ref begin, end; }inner; std::list child; xnode *parent; _Self_Reftype self; }; xresource_t() { //预定义的几个转义符实体:lt gt amp quot apos escape_bodys[{ (Basetype)'l', (Basetype)'t'}] = { (Basetype)'<' }; escape_bodys[{ (Basetype)'g', (Basetype)'t' }] = { (Basetype)'>' }; escape_bodys[{ (Basetype)'a', (Basetype)'m', (Basetype)'p' }] = { (Basetype)'&' }; escape_bodys[{ (Basetype)'q', (Basetype)'u', (Basetype)'o', (Basetype)'t' }] = { (Basetype)'"' }; escape_bodys[{ (Basetype)'a', (Basetype)'p', (Basetype)'o', (Basetype)'s' }] = { (Basetype)'\'' }; } void clear() { root.child.clear(); docs.clear(); tags.clear(); attr_names.clear(); attr_values.clear(); root.parent = nullptr; root.ti.doc_body_ref = root.inner.end = root.inner.begin = docs.end(); } xnode root; xdoctext_t docs; xtagtext_t tags; xattrname_t attr_names; xattrvalue_t attr_values; std::map<_Ty, _Ty> escape_bodys; _Ty refactor_buffer; }; template class xparser_t { public: using _StringTy = typename _XtsTy::strtype; using Basetype = typename _XtsTy::Basetype; xparser_t() { // 这里忽悠一下编译器,自动根据类型选择:strstr 或 wcsstr typedef const char *(*STRSTRFUNC)(const char *, const char *); typedef const wchar_t *(*WSTRSTRFUNC)(const wchar_t *, const wchar_t *); typedef const Basetype *(*MYSTRSTRFUNC)(const void *, const void *); __multiec_strstr = ((sizeof(Basetype) == 1) ? 
((MYSTRSTRFUNC)((STRSTRFUNC)strstr)) : ((MYSTRSTRFUNC)((WSTRSTRFUNC)wcsstr))); } private: void x_escape_number() { //数值类型的unicode字符转义处理 //这里我是自己实现的字符串转换数字, //因为使用C标准转换需要额外拷贝一次 & 到 ; 字符串,为了避免这个拷贝,就要临时改变转义符结束符 ; 的位置为0来给strtol去计算 //而在之后的dom类的load_string设计中,很可能会直接允许static const char *xxx= "...";这样的东西传入到这里进行解析。 //在windows中,数据段的静态常数成员是的内存页面保护是PAGE_EXECUTE_READ,不能写操作。 //所以我在这里简单实现了字符串 => 数字。 xml_size_t ebgn = xts.index - 1; long long x = 0; if (!xts.next_is_char('x')) { // # 后面如果不是x,就按10进制的规则来处理 if(!(xts.s & XML_SYNTAX::_X_NUMBER)) err(xts.index, 22); xts.set_flags(XML_SYNTAX::_X_NUMBER | XML_SYNTAX::_X_ESCAPEEND); for (;;) { x = (x * 10) + (xts.c - '0'); if (!xts.next_is_flags()) err(xts.index, 22); if (xts.c == ';') break; } } else { // # 后面是x,按16进制处理 xts.set_flags(XML_SYNTAX::_X_HEX); if (!xts.next_is_flags()) err(xts.index, 23); xts.set_flags(XML_SYNTAX::_X_HEX | XML_SYNTAX::_X_ESCAPEEND); int i = 0; for (;; i++) { long long _Tmp; switch (xts.c) { case '0':case '1':case '2':case '3':case '4':case '5':case '6':case '7':case '8':case '9': _Tmp = xts.c - '0'; break; case 'a':case 'b':case 'c':case 'd':case 'e':case 'f': _Tmp = xts.c - 'a' + 10; break; case 'A':case 'B':case 'C':case 'D':case 'E':case 'F': _Tmp = xts.c - 'A' + 10; break; } x += (_Tmp << (i << 2)); if (!xts.next_is_flags()) err(xts.index, 23); if (xts.c == ';') break; } //由于上面的十六进制数字计算顺序是反的,所以要从最高有效位来倒转 long long y = 0; for (int k = 0; k <= i; k++) y += ((x >> (k << 2)) & 0x0F) << ((i - k) << 2); x = y; } if (x < 0x20) { switch (x) { case '\t':case '\r':case '\n': break; default: err(ebgn, 24); } } else if (x > 0xD800 && x < 0xDFFF) err(ebgn, 25); else if (x > 0x10FFFF) err(ebgn, 26); } void x_escape_body() { xml_size_t nbgn = xts.index; for (;;) { xts.next(); if (xts.c == ';') { break; } else { if (!(xts.s & XML_SYNTAX::_X_BEGINNAME)) err(xts.index, 19); } } _StringTy &_Tmp = _strtmp[4]; _Tmp.assign(xts.text + nbgn, xts.index - nbgn); auto it = res->escape_bodys.find(_Tmp); if (it == 
res->escape_bodys.end()) { errinfobuffer.reserve(_Tmp.length() * 3); int n = sprintf((char*)errinfobuffer.data(), ((sizeof(Basetype) != 1) ? "%ls" : "%s"), _Tmp.c_str()); err(nbgn, 20, errinfobuffer.c_str()); } } void x_escape() { xts.next(); if (xts.c == '#') { x_escape_number(); } else { if (!(xts.s & XML_SYNTAX::_X_BEGINNAME)) err(xts.index, 18); x_escape_body(); } } void x_cdata() { xml_size_t cbgn = xts.index - 2; const char *pcdata = "CDATA["; for (int i = 0; i < 6; i++) { if (!xts.next_is_char(pcdata[i])) err(xts.index, 16); } // CDATA的结束符比注释标签还要省事,直接向后搜索]]> const Basetype *p = __multiec_strstr(xts.text + xts.index + 1, _XtsTy::constval::cdata_end); if (p) { xts.index = (xml_size_t)(p - xts.text + 3); res->docs.push_back(_StringTy(xts.text + cbgn, xts.index - cbgn)); cur->inner.end = --(res->docs.end()); if (cur->inner.begin == res->docs.end()) cur->inner.begin = cur->inner.end; } else { err(cbgn, 17); } } void x_comment() { xml_size_t cbgn = xts.index - 2; if (!xts.next_is_char('-')) err(xts.index, 13); /* 不太清楚为什么xml注释中不允许存在--,我反正照做了。 从代码此处看,实际上是可以允许的,就像是CDATA的结束符那样。 utf8的情况下,无法双字搜索。 utf16的情况下,也无法4字节搜索。 例如这种情况: <--a-->,如果双字搜索,从a开始,有一个-就被忽略掉了,如果要判断这个问题,那实际上和单字节搜索一样的性能。 */ const Basetype *p = __multiec_strstr(xts.text + xts.index + 1, _XtsTy::constval::comment_end); if (p) { if (p[2] == '>') { xts.index = (xml_size_t)(p - xts.text + 3); res->docs.push_back(_StringTy(xts.text + cbgn, xts.index - cbgn)); cur->inner.end = --(res->docs.end()); if (cur->inner.begin == res->docs.end()) cur->inner.begin = cur->inner.end; } else { err((xml_size_t)(p - xts.text), 15); } } else { err(cbgn, 14); } } void x_specifics_tag() { //特殊标签,共有两个分支,注释和CDATA,DTD在根节点之前处理,不会进入这里 xts.set_flags(XML_SYNTAX::_X_LB | XML_SYNTAX::_X_NEGATIVE); if (!xts.next_is_flags()) err(xts.index, 2); if (xts.c == '-') x_comment(); else x_cdata(); } void x_end_node() { //结束标签处理 xts.set_flags(XML_SYNTAX::_X_BEGINNAME); if (!xts.next_is_flags()) err(xts.index, 10); xml_size_t nbgn = xts.index; xml_size_t 
nend; xts.set_flags(XML_SYNTAX::_X_NAME | XML_SYNTAX::_X_SPACE | XML_SYNTAX::_X_GT); bool _BackSpace = false; for (;;) { if (!xts.next_is_flags()) err(xts.index, 10); if (!(xts.s & XML_SYNTAX::_X_NAME)) { nend = xts.index; if (xts.s & XML_SYNTAX::_X_SPACE) _BackSpace = true; break; } } if (_BackSpace) { //后面还有空格 xts.set_flags(XML_SYNTAX::_X_SPACE | XML_SYNTAX::_X_GT); for (;;) { if (!xts.next_is_flags()) err(xts.index, 11); if (xts.c == '>') break; } } _StringTy &tmp = _strtmp[0]; tmp.reserve(nend - nbgn + 0x10); tmp.assign(xts.text + nbgn, nend - nbgn); if (tmp != cur->ti.name->first) { errinfobuffer.reserve((tmp.length() + cur->ti.name->first.length()) * 3 + 0x20); int n = sprintf((char*)errinfobuffer.data(), ((sizeof(Basetype) != 1) ? "%ls 与 %ls 不一致" : "%s 与 %s 不一致"), tmp.c_str(), cur->ti.name->first.c_str()); err(nbgn, 12, errinfobuffer.c_str()); } if (cur->inner.begin == res->docs.end()) { //如果这个节点的内容为空,说明,它跟一个自结束的节点没有区别 //直接在父节点中将它修改一个自结束节点即可。 cur->parent->inner.end->erase(cur->parent->inner.end->length() - 1); cur->parent->inner.end->append(_XtsTy::constval::autoend_tag_syntax); } else { res->docs.push_back(_XtsTy::constval::end_tag_syntax); auto it = --(res->docs.end()); it->append(tmp); (*it) += (Basetype)'>'; cur->inner.end = it; } cur = cur->parent; } int x_tag_name() { auto new_name = [this](xml_size_t left, xml_size_t right) { _StringTy &tmp = _strtmp[0]; tmp.assign(xts.text + left, right - left); auto it = res->tags.find(tmp); if (it == res->tags.end()) it = res->tags.insert({ tmp, std::list<_Nodetype*>() }).first; it->second.push_back(cur); cur->ti.name = it; cur->ti.name_index_ref = (--(it->second.end())); }; xts.set_flags( XML_SYNTAX::_X_NAME | //符合名称规范的字符 XML_SYNTAX::_X_GT | // > XML_SYNTAX::_X_TAGEND | // /自结束标签 XML_SYNTAX::_X_SPACE // 空白字符 ); xml_size_t name_begin = xts.index; for (;;) { if (!xts.next_is_flags()) err(xts.index, 1); switch (xts.c) { case '>': new_name(name_begin, xts.index); return 1; case '/': if (!xts.next_is_char('>')) 
err(xts.index, 4); new_name(name_begin, xts.index - 1); return 2; default: if (xts.s & XML_SYNTAX::_X_SPACE) { new_name(name_begin, xts.index); return 0; } break; } } } bool x_attr_name(_StringTy &_Name) { xts.set_flags( XML_SYNTAX::_X_NAME | //符合名称规范的字符 XML_SYNTAX::_X_EQUAL | //等于号 XML_SYNTAX::_X_SPACE // 空白字符 ); xml_size_t name_begin = xts.index; for (;;) { if (!xts.next_is_flags()) err(xts.index, 5); if (xts.s & (XML_SYNTAX::_X_EQUAL | XML_SYNTAX::_X_SPACE)) { _Name.assign(xts.text + name_begin, xts.index - name_begin); return xts.c == '='; } } return false; } char x_attr_value(_StringTy &_Value) { xts.set_flags( XML_SYNTAX::_X_STRING | //字符串 " ' XML_SYNTAX::_X_SPACE // 空白字符 ); char _Style; for (;;) { if (!xts.next_is_flags()) err(xts.index, 7); if (xts.s & XML_SYNTAX::_X_STRING) { _Style = (char)xts.c; break; } } xml_size_t value_begin = xts.index + 1; if (_Style == '"') { for (;;) { xts.next(); switch (xts.c) { case 0: err(xts.index, 8); case '<': err(xts.index, 9); case '&': //处理转义符 x_escape(); break; case '"': //字符串结束 _Value.assign(xts.text + value_begin, xts.index - value_begin); return _Style; default: break; } } } else { for (;;) { xts.next(); switch (xts.c) { case 0: err(xts.index, 8); case '<': err(xts.index, 9); case '&': //处理转义符 x_escape(); break; case '\'': //字符串结束 _Value.assign(xts.text + value_begin, xts.index - value_begin); return _Style; default: break; } } } return _Style; } void x_attr(xml_size_t &_Presize) { _StringTy &name = _strtmp[0]; _StringTy &value = _strtmp[1]; if (!x_attr_name(name)) { //x_attr_name中没有找到等于号,对应这种: xts.set_flags( XML_SYNTAX::_X_EQUAL | //等号 XML_SYNTAX::_X_SPACE // 空白字符 ); for (;;) { if (!xts.next_is_flags()) err(xts.index, 7); if (xts.c == '=') break; } } char _Style = x_attr_value(value); _Presize += (xml_size_t)(name.length() + value.length() + 6); auto itn = res->attr_names.find(name); if (itn == res->attr_names.end()) itn = res->attr_names.insert(name).first; auto itv = res->attr_values.find(value); if (itv == 
res->attr_values.end()) itv = res->attr_values.insert({ value, 1 }).first; cur->attrs.push_back({ itn, itv, _Style }); } int x_preattr(xml_size_t &_Presize) { /* x_tag_name里没有找到 > 的情况下,在标签属性解析开始之前, 对应下面这几种情况: */ xts.set_flags( XML_SYNTAX::_X_SPACE | //空白字符 XML_SYNTAX::_X_GT | // > XML_SYNTAX::_X_BEGINNAME | //名称首字符 XML_SYNTAX::_X_TAGEND // /自结束标签 ); for (;;) { if (!xts.next_is_flags()) err(xts.index, 5); switch (xts.c) { case '>': return 1; case '/': if (!xts.next_is_char('>')) err(xts.index, 4); return 2; default: if (xts.s & XML_SYNTAX::_X_BEGINNAME) { x_attr(_Presize); xts.set_flags( XML_SYNTAX::_X_SPACE | // 空白字符 XML_SYNTAX::_X_GT | // > XML_SYNTAX::_X_BEGINNAME | // 名称首字符 XML_SYNTAX::_X_TAGEND // /自结束标签 ); } break; } } } void x_new_node() { cur->child.push_back(_Nodetype(cur, res)); auto it = (--cur->child.end()); cur = &(*it); cur->self = it; int n = x_tag_name(); xml_size_t _PreSize = (xml_size_t)(cur->ti.name->first.length() + 3); if (!n) n = x_preattr(_PreSize); cur->refactor_tag_body(n, _PreSize, res); if (n == 2) cur = cur->parent; } void x_tag() { //标签开始后,下一个字符只能是 符合名称规范的第一个字符,感叹号 !,结束标签 / xts.next(); switch (xts.c) { case '!': x_specifics_tag(); break; case '/': x_end_node(); break; default: if (!(xts.s & XML_SYNTAX::_X_BEGINNAME)) err(xts.index, 1); x_new_node(); break; } } void x_text() { //标签之外的有效文本处理 xml_size_t tbegin = _xnf; _StringTy &tmp = _strtmp[3]; tmp.clear(); for (;;) { xts.next(); switch (xts.c) { case 0: return; case '&': if (tbegin == _xnf) tbegin = xts.index; x_escape(); break; case '<': //处理标签之前,先处理有效文本 if (tbegin != _xnf) { if (tmp.length()) tmp += ' '; tmp.append(xts.text + tbegin, xts.index - tbegin); tbegin = _xnf; } if (tmp.length()) { res->docs.push_back(tmp); cur->inner.end = --(res->docs.end()); if (cur->inner.begin == res->docs.end()) cur->inner.begin = cur->inner.end; tmp.clear(); } x_tag(); break; default: if (!(xts.s & XML_SYNTAX::_X_SPACE)) { if (tbegin == _xnf) tbegin = xts.index; } else { 
//遇到空白字符时,如果有效文本开始位置已经记录过了,则将这一段有效的东西添加到有效文本结 if (tbegin != _xnf) { if (tmp.length()) tmp += ' '; tmp.append(xts.text + tbegin, xts.index - tbegin); tbegin = _xnf; } } } } } void x_dtd() { xml_size_t pos = xts.index - 1; const char *p = "OCTYPE"; for (int i = 0; i < 6; i++) { if (!xts.next_is_char(p[i])) err(pos, 2); } int n = 1; int _StrType = 0; for (;;) { xts.next(); switch (xts.c) { case '"': case '\'': if (!_StrType) _StrType = xts.c; else if (_StrType == xts.c) _StrType = 0; break; case '<': n++; break; case '>': if (!_StrType) { if (!(--n)) { res->docs.push_back(_StringTy(xts.text + pos, xts.index - pos + 1)); cur->inner.end = --(res->docs.end()); if (cur->inner.begin == res->docs.end()) cur->inner.begin = cur->inner.end; //wprintf(L"%s\n", cur->inner.end->c_str()); return; } } break; default: break; } } } void x_declare() { xml_size_t pos = xts.index - 1; int _StrType = 0; for (;;) { xts.next(); switch (xts.c) { case '"': case '\'': if (!_StrType) _StrType = xts.c; else if (_StrType == xts.c) _StrType = 0; break; case '?': if (!_StrType) { if (!xts.next_is_char('>')) err(xts.index, 30); res->docs.push_back(_StringTy(xts.text + pos, xts.index - pos + 1)); cur->inner.end = --(res->docs.end()); if (cur->inner.begin == res->docs.end()) cur->inner.begin = cur->inner.end; return; } break; default: break; } } } int x_root() { if (setjmp(_Rem)) return -1; xts.set_flags(XML_SYNTAX::_X_LT | XML_SYNTAX::_X_SPACE); bool root_break = false; for (;;) { if (xts.c == '<') { xts.next(); switch (xts.c) { case '!': xts.next(); if (xts.c == '-') x_comment(); else if (xts.c == 'D') x_dtd(); else err(xts.index, 2); xts.set_flags(XML_SYNTAX::_X_LT | XML_SYNTAX::_X_SPACE); break; case '?': if (xts.index != 1) err(xts.index, 31); x_declare(); xts.set_flags(XML_SYNTAX::_X_LT | XML_SYNTAX::_X_SPACE); break; default: if (xts.s & XML_SYNTAX::_X_BEGINNAME) { xts.back(1); x_tag(); x_text(); root_break = true; } else { err(xts.index, 2); } break; } if (root_break) break; } if 
(!xts.next_is_flags()) err(xts.index, 3); } if (cur != &(res->root)) { //解析完字符串之后,如果当前标签不为null,则属于错误。 err(xts.size, 28); } return 0; } void err(xml_size_t _Pos, int _Number, const char *_Info = "") { errp = { _Pos, _Number, _Info }; longjmp(_Rem, 1); } public: int load(const Basetype *_Text, int _Size, xresource_t<_StringTy> *pres) { xts.init((const Basetype*)_Text, _Size); errp.number = 0; errp.pos = 0; res = pres; cur = &(res->root); return x_root(); } void get_errp(xerrorpos &e) { e = errp; } void get_err_pos(xerrorpos &e) { e.line = 0; e.column = 0; if (!e.pos || !e.number) return; auto pos = xts.index - xts.cl; xts.index = 0; xts.c = xts.text[0]; e.line = 1; e.column = 1; for (; xts.index < e.pos; xts.next_donot_syntax()) { if (xts.c == '\n') { e.line++; e.column = 1; } else { e.column++; } } xts.index = pos; xts.next(); } private: friend class xelement_t<_XtsTy>; jmp_buf _Rem; _XtsTy xts; xerrorpos errp; using _Nodetype = typename xresource_t<_StringTy>::xnode; _Nodetype *cur; xresource_t<_StringTy> *res; _StringTy _strtmp[8];//由于使用了jmp_buf来进行错误直接远跳,为了避免内存泄漏,所以将栈中需要的字符串对象也储存在这里 const Basetype*(*__multiec_strstr)(const void*, const void*); std::string errinfobuffer; }; static const char *xml_error_information[] = { "", "开始标签:无效的元素名称", //1 "根节点之前的无效的特殊标签", //2 "根节点之前的无效的字符", //3 "自结束标签:此处应为 >", //4 "标签属性:无效的标签属性名称", //5 "标签属性:此处应为 =", //6 "标签属性:此处应为 \" 或 '", //7 "标签属性:未找到对应的属性结束符(\" 或 ')", //8 "标签属性:< 不允许出现在属性值中", //9 "结束标签:无效的元素名称", //10 "结束标签:此处应为 >", //11 "结束标签:开始标签与结束标签不匹配,参考信息:%s", //12 "注释标签:无效的注释标签,此处或许应为 -",//13 "注释标签:未找到注释标签结束符(-->)",//14 "注释标签:-- 不允许单独出现在注释标签中",//15 "CDATA:无效的CDATA标签",//16 "CDATA:未找到CDATA结束符(]]>)",//17 "转义符:无效的转义符名称首字符",//18 "转义符:无效的转义符字符",//19 "转义符:%s 是未定义的实体",//20 "转义符:无效的转义符字符",//21 "字符数值转义:无效的10进制数字字符",//22 "字符数值转义:无效的16进制数字字符",//23 "字符数值转义:小于32(0x20)的字符仅允许\\t\\r\\n出现在xml中",//24 "字符数值转义:0xD800-0xDFFF为UNICODE代理字符,不允许单独出现在xml中",//25 "字符数值转义:字符值溢出,参考最大值(0x10FFFF)",//26 "转义符:无效的转义符",//27 "根节点未封闭",//28 "无效的文档:%s",//29 "XML声明种错误的符号,此处应为 
>",//30 "XML声明前不允许存在其他字符",//31 "未找到XML声明结束符(?>)",//32 }; template class xelement_t { public: using _StringTy = typename _XtsTy::strtype; using _Nodetype = typename xresource_t<_StringTy>::xnode; using Basetype = typename xresource_t<_StringTy>::Basetype; bool eof() { return _Node == nullptr; } xelement_t(_Nodetype *_Val) { _Node = _Val; } bool operator==(xelement_t &e) { return e._Node == _Node; } bool operator!=(xelement_t &e) { return e._Node != _Node; } _StringTy get_name() { if (eof()) return _XtsTy::constval::emp; return _Node->ti.name->first; } _StringTy get_attr(const _StringTy &_AttrName) { if (eof()) return _XtsTy::constval::emp; for (auto it = _Node->attrs.begin(); it != _Node->attrs.end(); ++it) { if (*(it->name) == _AttrName) return it->value->first; } return ""; } _StringTy get_text(int _Flags = 0) { if (eof()) return _XtsTy::constval::emp; _StringTy _Tmp; auto begin = _Node->inner.begin; auto end = _Node->inner.end; if (_Flags & 1) { --begin; ++end; } for (auto it = begin; it != end; ++it) { if (it->length() > 6 && it->at(0) == '<' && it->at(1) == '!' && it->at(2) == '-') continue; _Tmp += it->c_str(); } return _Tmp; } _StringTy get_inner_xml() { if (eof()) return _XtsTy::constval::emp; _StringTy _Tmp; for (auto it = _Node->inner.begin; it != _Node->inner.end; ++it) { if (it->length() > 3 && it->at(0) == '<' && it->at(1) == '!' 
&& it->at(2) == '-') continue; if (it->length() > 4 && *it == _XtsTy::constval::br_tag) { _Tmp += _XtsTy::constval::crlf; continue; } if (it->length() > 1 && it->at(0) == '<' && it->at(1) != '/') _Tmp += _XtsTy::constval::crlf; _Tmp += it->c_str(); } return _Tmp; } private: friend class xdocument_t<_XtsTy>; _Nodetype *_Node; }; template class xdocument_t { public: xdocument_t() { nodepath_array.reserve(0x10); } ~xdocument_t() { res.clear(); } using _StringTy = typename _XtsTy::strtype; using _ParserTy = xparser_t<_XtsTy>; using Basetype = typename _XtsTy::Basetype; using _ResourceTy = xresource_t<_StringTy>; using _TagIndexTy = typename _ResourceTy::xtagindex_t; using element = xelement_t<_XtsTy>; using _Nodetype = typename element::_Nodetype; int load_file(const _StringTy &_Filename) { errp.pos = 0; errp.line = 0; errp.column = 0; res.clear(); std::ifstream fs(_Filename.c_str(), std::ios::binary); fs.seekg(0, std::ios::end); size_t s = (size_t)fs.tellg(); fs.seekg(0, std::ios::beg); if (!s) { errp.information.reserve(_Filename.length() * 3); sprintf((char*)errp.information.data(), (sizeof(Basetype) != 1) ? "%ls" : "%s", _Filename.c_str()); errp.number = 29; errp.pos = 0; return -1; } char *p = new char[s + 2]; p[s] = 0; p[s + 1] = 0; fs.read(p, s); fs.close(); size_t _Off = 0; //预测文档编码,并不一定准确,只能说想到的判断都做了。 /*返回值有4种: 0 多字节编码非utf-8 1 utf-16 2 utf-8 -1 错误 */ #if defined(_WIN32) || defined(_WIN64) _SrcEncode = encode_adaptive::xmlec_predict(p, s, &(errp.number), &_Off); if (_SrcEncode < 0) { delete p; errp.information.reserve(_Filename.length() * 3); sprintf((char*)errp.information.data(), (sizeof(Basetype) != 1) ? 
"%ls" : "%s", _Filename.c_str()); return -1; } //很遗憾的事情是,c++17删除了编码转换库,所以,只能使用操作系统的函数来完成了。 //虽然这个类库并不依赖c++17,但为了以后和新标准对接,所以只能自己实现跨平台的转换策略。 //另外一点是,linux其实对转码没有什么需求。 _StringTy _Text; if (encode_adaptive::specifiy(p + _Off, _SrcEncode, _XtsTy::_encoding, _Text) == _nf) { delete p; errp.information.reserve(_Filename.length() * 3); sprintf((char*)errp.information.data(), (sizeof(Basetype) != 1) ? "%ls" : "%s", _Filename.c_str()); errp.number = 29; return -1; } delete p; _ParserTy xp; int _Result = xp.load(_Text.c_str(), (xml_size_t)s, &res); #else _ParserTy xp; int _Result = xp.load(p, (xml_size_t)s, &res); delete p; #endif res.root.inner.end = res.docs.end(); xp.get_errp(errp); if (errp.number) xp.get_err_pos(errp); return _Result; } element get_element(const _StringTy &_TagName) { size_t _Off = 0; size_t _Pos; Basetype *_Ptr = (Basetype *)_TagName.c_str(); nodepath_array.clear(); auto i = res.tags.end(); for (;;) { _Pos = _TagName.find('/', _Off); if (_Pos == _nf) break; _Ptr[_Pos] = 0; i = res.tags.find(_Ptr + _Off); _Ptr[_Pos] = '/'; if (i == res.tags.end()) return nullptr; nodepath_array.push_back(&(i->second)); _Off = _Pos + 1; } i = res.tags.find(_Ptr + _Off); if (i == res.tags.end()) return nullptr; if (!nodepath_array.size()) return *(i->second.begin()); nodepath_array.push_back(&(i->second)); return recursive_nodepath(nullptr, 0); } element get_element(element &_Parent, const _StringTy &_TagName) { auto fit = res.tags.find(_TagName); if (fit != res.tags.end()) { for (auto it = fit->second.begin(); it != fit->second.end(); ++it) { if (_Parent->_Node == it->_Node) return it; } } return nullptr; } std::string get_error_info() { char buf[256]; std::string _Result; if (errp.pos != 0) { sprintf(buf, "XML错误位于 行(%d), 列(%d):", errp.line, errp.column); _Result += buf; } sprintf(buf, xml_error_information[errp.number], errp.information.c_str()); _Result += buf; return _Result; } element root() { return &(res.root); } element end() { return nullptr; } private: _Nodetype 
*recursive_nodepath(_Nodetype *_Parent, size_t i) { _TagIndexTy *pti = nodepath_array[i]; auto _next = i + 1; if (_next == nodepath_array.size()) { for (auto it = pti->begin(); it != pti->end(); ++it) { if (!i || (*it)->parent == _Parent) return *it; } } else { for (auto it = pti->begin(); it != pti->end(); ++it) { if (!i || (*it)->parent == _Parent) { _Nodetype *p = recursive_nodepath(*it, _next); if (p) return p; } } } return (_Nodetype *)nullptr; } private: _ResourceTy res; xerrorpos errp; int _SrcEncode; std::vector<_TagIndexTy*> nodepath_array; }; } #if defined(_WIN32) || defined(_WIN64) template using xdoc = aqx_internal::xdocument_t<_Ty>; using xts_utf8 = aqx_internal::xts_utf8; using xts_utf16 = aqx_internal::xts_utf16; using xts_asc = aqx_internal::xts_asc; #else using xdoc = aqx_internal::xdocument_t; #endif } #pragma warning(pop)
//encode_adaptive.h - windows only
#pragma once
#include <string>
#include "tcvt.h"

// Sentinel value (size_t)-1, i.e. the largest size_t. The code in this header
// uses it both as the "conversion not possible" return value and as the
// "not found" result when comparing against std::string::find().
#ifndef _nf
#define _nf ((size_t)-1)
#endif
namespace aqx {

    namespace encode_adaptive {


        // Integer identifiers for the text encodings handled by this namespace.
        // The numeric values match the literals 0 / 1 / 2 used by the functions below.
        static constexpr int unknow = -1;  // encoding not determined
        static constexpr int sys = 0;      // system (ANSI code page) encoding
        static constexpr int utf16 = 1;    // UTF-16
        static constexpr int utf8 = 2;     // UTF-8
        /// Guesses the encoding of a raw text buffer.
        ///
        /// @param _Text  buffer to inspect (only the first 16 bytes are examined
        ///               when no BOM is present).
        /// @param _Size  number of bytes available in _Text.
        /// @param _Off   receives the number of BOM bytes to skip (3, 2 or 0).
        /// @param _Def   value returned when nothing suggests UTF-16.
        /// @return 2 for a UTF-8 BOM, 1 for a UTF-16 (FF FE) BOM or when NUL
        ///         bytes suggest UTF-16, otherwise _Def.
        static int profile_predict(unsigned char *_Text, size_t _Size, int &_Off, int _Def = 0) {

            // UTF-8 byte order mark: EF BB BF
            const bool has_utf8_bom =
                _Size >= 3 && _Text[0] == 0xEF && _Text[1] == 0xBB && _Text[2] == 0xBF;
            if (has_utf8_bom) {
                _Off = 3;
                return 2;
            }

            // UTF-16 byte order mark: FF FE
            const bool has_utf16_bom =
                _Size >= 2 && _Text[0] == 0xFF && _Text[1] == 0xFE;
            if (has_utf16_bom) {
                _Off = 2;
                return 1;
            }

            // No BOM: count NUL bytes in the first (at most) 16 bytes.
            _Off = 0;
            const size_t probe_len = (_Size > 0x10) ? 0x10 : _Size;
            int zero_count = 0;
            size_t idx = 0;
            while (idx < probe_len) {
                if (_Text[idx] == 0)
                    ++zero_count;
                ++idx;
            }

            // No NUL at all -> keep the caller's default.
            if (zero_count == 0)
                return _Def;
            // A single NUL in a buffer that was scanned completely is also
            // treated as "not UTF-16".
            if (zero_count == 1 && probe_len == _Size)
                return _Def;
            return 1;
        }

        template
        static int profile_adaptive(char *_Text, size_t _Size, _Ty &_Result, int _Def = 0) {
            int _StartOff = 0;
            int _SrcCode = encode_adaptive::profile_predict((unsigned char*)_Text, _Size, _StartOff, _Def);
            size_t _TargetCode = 0;
            if (sizeof(decltype(*_Result.c_str())) == 2)
                _TargetCode = 1;
            std::wstring _utf16;
            if (_SrcCode == 2)
                aqx::utf16_from_utf8(_utf16, _Text + _StartOff);
            else if (_SrcCode == 1)
                _utf16 = (wchar_t*)(_Text + _StartOff);
            else
                aqx::utf16_from_asc(_utf16, _Text + _StartOff);
            auto _proc0 = [](void *_Res, std::wstring &_wstr) { asc_from_utf16(*(std::string*)_Res, _wstr); };
            auto _proc1 = [](void *_Res, std::wstring &_wstr) { *(std::wstring*)(_Res) = _wstr; };
            auto _proc2 = [](void *_Res, std::wstring &_wstr) { aqx::utf8_from_utf16(*(std::string*)(_Res), _wstr); };

            if (_TargetCode == 0)
                _proc0(&_Result, _utf16);
            else
                _proc1(&_Result, _utf16);
            return _SrcCode;
        }

        template
        static size_t specifiy(char *_Text, int _Srcec, int _Targetec, _Ty &_Result) {
            if (sizeof(_Ty::_Mybase::_Alty::value_type) == 1 && _Targetec == 1)
                return _nf;
            if (sizeof(_Ty::_Mybase::_Alty::value_type) == 2 && _Targetec != 1)
                return _nf;
            if (_Srcec == 2) {

                if (_Targetec == 2)
                {
                    *(std::string*)&_Result = (_Text);
                    return _Result.length();
                }
                else if (_Targetec == 1)
                    return utf16_from_utf8(*(std::wstring*)&_Result, _Text);
                else
                    return asc_from_utf8(*(std::string*)&_Result, _Text);

            }
            else if (_Srcec == 1)
            {
                if (_Targetec == 2)
                    return utf8_from_utf16(*(std::string*)&_Result, (wchar_t*)_Text);
                else if (_Targetec == 1) {
                    *(std::wstring*)&_Result = (wchar_t*)(_Text);
                    return _Result.length();
                }
                else
                    return asc_from_utf16(*(std::string*)&_Result, (wchar_t*)_Text);
            }
            else
            {
                if (_Targetec == 2)
                    return utf8_from_asc(*(std::string*)&_Result, _Text);
                else if (_Targetec == 1)
                    return utf16_from_asc(*(std::wstring*)&_Result, _Text);
                else {
                    *(std::string*)&_Result = (_Text);
                    return _Result.length();
                }
            }
            return _nf;
        }


        /// Appends _Str to the std::string that _Res points to.
        /// _Res must point to a std::string object.
        static void unknow_append(void *_Res, std::string _Str) {
            std::string *target = static_cast<std::string*>(_Res);
            target->append(_Str);
        }
        /// Appends _Str to the std::wstring that _Res points to.
        /// _Res must point to a std::wstring object.
        static void unknow_wappend(void *_Res, std::wstring _Str) {
            std::wstring *target = static_cast<std::wstring*>(_Res);
            target->append(_Str);
        }





        /// Checks whether a BOM-less, NUL-terminated wide string looks like an
        /// XML document.
        ///
        /// @param _Text  wide text to inspect.
        /// @param _Size  length of _Text; anything shorter than 7 is rejected.
        /// @return  1  a '<' was found and, if an XML declaration is present, it
        ///             starts the text and is closed by "?>";
        ///         -1  too short, or no '<' found;
        ///         -3  an XML declaration exists but is not at the very start;
        ///         -4  the XML declaration is never closed.
        static int xmlec_nbom_wchar(wchar_t *_Text, size_t _Size) {
            // A document this short cannot be valid XML.
            if (_Size < 7)
                return -1;

            wchar_t *first_lt = wcschr(_Text, L'<');
            if (first_lt == nullptr)
                return -1;

            // No XML declaration: nothing more to validate.
            if (first_lt[1] != L'?')
                return 1;

            // The declaration must be the first thing in the document ...
            if (first_lt != _Text)
                return -3;

            // ... and it must be terminated.
            return wcsstr(_Text + 2, L"?>") ? 1 : -4;
        }

        /// Inspects a BOM-less, NUL-terminated byte buffer and guesses the XML
        /// document's encoding.
        ///
        /// @param _Text  buffer to inspect.
        /// @param _Size  size of the buffer in bytes.
        /// @return  2  UTF-8 (declared explicitly, or assumed by default);
        ///          0  the XML declaration mentions GBK or GB2312;
        ///          1  the buffer was handed to xmlec_nbom_wchar() and accepted;
        ///         -1  no '<' found in the byte string;
        ///         -2  a '<' followed by NUL that cannot be UTF-16;
        ///         -3  XML declaration not at the start of the document;
        ///         -4  XML declaration never closed.
        ///         Other negative values may be forwarded from xmlec_nbom_wchar().
        static int xmlec_nbom_char(char *_Text, size_t _Size) {
            char *p = strchr(_Text, '<');
            if (!p) return -1;

            // The first '<' is followed by a zero byte: the text may be UTF-16.
            if (!p[1])
            {
                // '<' is already the last valid character of the buffer.
                if (static_cast<size_t>(p - _Text) == _Size - 1) return -2;
                // An odd byte count can never be UTF-16.
                if (_Size % 2) return -2;
                return xmlec_nbom_wchar((wchar_t*)_Text, (_Size >> 1));
            }

            if (p[1] == '?') {
                // The XML declaration has to be the first thing in the file.
                if (p != _Text) return -3;
                p = strstr(_Text + 2, "?>");
                // Declaration is never closed.
                if (!p) return -4;

                // Upper-case a copy of the declaration so the encoding name can be
                // matched case-insensitively. toupper() must be given a value
                // representable as unsigned char, hence the cast.
                std::string str(_Text, p - _Text + 2);
                std::transform(str.begin(), str.end(), str.begin(),
                    [](char c) { return static_cast<char>(toupper(static_cast<unsigned char>(c))); });
                if (str.find("UTF-8") != _nf) return 2;
                if (str.find("GBK") != _nf) return 0;
                if (str.find("GB2312") != _nf) return 0;
            }

            // No declaration, or no recognised encoding name: assume UTF-8.
            return 2;
        }

        /// Predicts the encoding of an XML document held in a raw buffer.
        ///
        /// A byte order mark is honoured first; without one the buffer is probed
        /// as a byte string and, if that finds no '<', as a wide string.
        ///
        /// @param _Text       NUL-terminated buffer with the document.
        /// @param _Size       size of the buffer in bytes.
        /// @param err_number  receives 0 on success, otherwise 29 (not a usable
        ///                    document), 31 (XML declaration not at the start) or
        ///                    32 (XML declaration not closed).
        /// @param _Off        optional; receives the number of BOM bytes to skip.
        /// @param _Default    returned when no probe yields an encoding.
        /// @return 0 (system/GBK), 1 (UTF-16), 2 (UTF-8), _Default, or -1 on error.
        ///
        /// NOTE(review): the wide-string probes rely on the buffer being terminated
        /// by a wide NUL - confirm that callers pad the buffer accordingly.
        static int xmlec_predict(char *_Text, size_t _Size, int *err_number, size_t *_Off = NULL, int _Default = 2) {
            *err_number = 0;
            if (_Size < 7) {
                // A document shorter than 7 bytes cannot be valid XML.
                *err_number = 29;
                return -1;
            }

            // Translates the negative codes of the xmlec_nbom_* helpers.
            const auto _MapError = [err_number](int code) {
                if (code == -2)
                    *err_number = 29;
                else if (code == -3)
                    *err_number = 31;
                else if (code == -4)
                    *err_number = 32;
            };

            const unsigned char *_Bytes = reinterpret_cast<const unsigned char*>(_Text);

            // UTF-8 BOM (EF BB BF)
            if (_Bytes[0] == 0xEF && _Bytes[1] == 0xBB && _Bytes[2] == 0xBF) {
                if (_Off) *_Off = 3;
                char *p = strchr(_Text + 3, '<');
                if (!p) {
                    *err_number = 29;
                    return -1;
                }

                if (p[1] == '?')
                {
                    // The declaration must directly follow the BOM ...
                    if (p != _Text + 3) {
                        *err_number = 31;
                        return -1;
                    }
                    // ... and must be closed.
                    if (!strstr(_Text + 5, "?>")) {
                        *err_number = 32;
                        return -1;
                    }
                }

                return 2;
            }

            // UTF-16 BOM (FF FE)
            if (_Bytes[0] == 0xFF && _Bytes[1] == 0xFE) {
                if (_Off) *_Off = 2;
                wchar_t *_Wide = (wchar_t*)_Text + 1;
                wchar_t *p = wcschr(_Wide, L'<');
                if (!p) {
                    *err_number = 29;
                    return -1;
                }

                if (p[1] == L'?')
                {
                    if (p != _Wide) {
                        *err_number = 31;
                        return -1;
                    }
                    if (!wcsstr((wchar_t*)_Text + 3, L"?>")) {
                        *err_number = 32;
                        return -1;
                    }
                }
                return 1;
            }

            // No BOM: probe the content as a byte string first.
            if (_Off) *_Off = 0;
            int n = xmlec_nbom_char(_Text, _Size);
            if (n >= 0)
                return n;
            if (n < -1) {
                _MapError(n);
                return -1;
            }

            // n == -1: no '<' visible in the byte string. If the size allows it,
            // retry treating the buffer as wide characters.
            if (!(_Size % 2))
                n = xmlec_nbom_wchar((wchar_t*)_Text, (_Size >> 1));

            if (n < -1) {
                _MapError(n);
                return -1;
            }
            // A successful wide probe means the text is UTF-16; report that
            // instead of falling back to the default.
            if (n >= 0)
                return n;

            return _Default;
        }


    };

}
//tcvt.h - windows only

#pragma once
#if defined(_WIN32) || defined(_WIN64)
#ifndef _WINDOWS_
#include <windows.h>
#endif
#endif

namespace aqx {

    /// Converts a multi-byte string to UTF-16 using MultiByteToWideChar.
    ///
    /// @param _Cp   code page of _Mbs (e.g. CP_ACP, CP_UTF8).
    /// @param _Mbs  input text.
    /// @param _Wcs  receives the converted text; empty if nothing was converted.
    /// @return number of wide characters written, 0 if nothing was converted.
    static size_t _mbs2wcs(int _Cp, const std::string &_Mbs, std::wstring &_Wcs) {
        const int _SrcLen = (int)_Mbs.length();

        // First call only measures the required number of wide characters.
        const int n = MultiByteToWideChar(_Cp, 0, _Mbs.c_str(), _SrcLen, nullptr, 0);
        if (n <= 0) {
            _Wcs.clear();
            return 0;
        }

        // Size the buffer exactly and tell the API that size (not capacity()),
        // so it can never write outside the string's valid range.
        _Wcs.resize(n);
        return MultiByteToWideChar(_Cp, 0, _Mbs.c_str(), _SrcLen, &_Wcs[0], n);
    }

    /// Converts a UTF-16 string to a multi-byte string using WideCharToMultiByte.
    ///
    /// @param _Cp   target code page (e.g. CP_ACP, CP_UTF8).
    /// @param _Wcs  input text.
    /// @param _Mbs  receives the converted text; empty if nothing was converted.
    /// @return number of bytes written, 0 if nothing was converted.
    static size_t _wcs2mbs(int _Cp, const std::wstring &_Wcs, std::string &_Mbs) {
        const int _SrcLen = (int)_Wcs.length();

        // First call only measures the required number of bytes. The last two
        // arguments are pointers and are deliberately null.
        const int n = WideCharToMultiByte(_Cp, 0, _Wcs.c_str(), _SrcLen, nullptr, 0, nullptr, nullptr);
        if (n <= 0) {
            _Mbs.clear();
            return 0;
        }

        // Size the buffer exactly and tell the API that size (not capacity()),
        // so it can never write outside the string's valid range.
        _Mbs.resize(n);
        return WideCharToMultiByte(_Cp, 0, _Wcs.c_str(), _SrcLen, &_Mbs[0], n, nullptr, nullptr);
    }

    
    /// Converts text in the system ANSI code page (CP_ACP) to UTF-8, going
    /// through UTF-16. Returns the result of the final _wcs2mbs() step.
    static size_t utf8_from_asc(std::string &_Result, const std::string &_Asc) {
        std::wstring wide;
        _mbs2wcs(CP_ACP, _Asc, wide);
        const size_t written = _wcs2mbs(CP_UTF8, wide, _Result);
        return written;
    }

    /// Converts text in the system ANSI code page (CP_ACP) to UTF-16.
    /// Returns the result of _mbs2wcs().
    static size_t utf16_from_asc(std::wstring &_Result, const std::string &_Asc) {
        const size_t written = _mbs2wcs(CP_ACP, _Asc, _Result);
        return written;
    }

    /// Converts UTF-8 text to the system ANSI code page (CP_ACP), going
    /// through UTF-16. Returns the result of the final _wcs2mbs() step.
    static size_t asc_from_utf8(std::string &_Result, const std::string &_U8s) {
        std::wstring wide;
        _mbs2wcs(CP_UTF8, _U8s, wide);
        const size_t written = _wcs2mbs(CP_ACP, wide, _Result);
        return written;
    }

    /// Converts UTF-8 text to UTF-16. Returns the result of _mbs2wcs().
    static size_t utf16_from_utf8(std::wstring &_Result, const std::string &_U8s) {
        const size_t written = _mbs2wcs(CP_UTF8, _U8s, _Result);
        return written;
    }

    /// Converts UTF-16 text to UTF-8. Returns the result of _wcs2mbs().
    static size_t utf8_from_utf16(std::string &_Result, const std::wstring &_Wcs) {
        const size_t written = _wcs2mbs(CP_UTF8, _Wcs, _Result);
        return written;
    }

    /// Converts UTF-16 text to the system ANSI code page (CP_ACP).
    /// Returns the result of _wcs2mbs().
    static size_t asc_from_utf16(std::string &_Result, const std::wstring &_Wcs) {
        const size_t written = _wcs2mbs(CP_ACP, _Wcs, _Result);
        return written;
    }

}

测试代码:

#include "pch.h"
#include 
#include "xml.hpp"
#include 

// Demo: loads an XML file, reports the parse time, and prints the inner XML
// of the first "CATALOG2" element.
int main()
{

    setlocale(LC_ALL, "");

    // Three encodings are supported: aqx::xts_utf16, aqx::xts_utf8, aqx::xts_asc.
    // NOTE(review): the template argument is assumed to be aqx::xts_utf16 because
    // the calls below pass wide-character strings - confirm.
    aqx::xdoc<aqx::xts_utf16> doc;
    const clock_t t = clock();
    int err = doc.load_file(L"G:\\vs2017\\test\\生成\\test.xml");

    // clock_t is not guaranteed to be an int: convert to milliseconds explicitly
    // and print with a matching format specifier.
    const long elapsed_ms = (long)((clock() - t) * 1000.0 / CLOCKS_PER_SEC);
    printf("解析文档耗时:%ld ms\n", elapsed_ms);
    if (err) {
        printf("%s\n", doc.get_error_info().c_str());
        return 0;
    }
    auto e = doc.get_element(L"CATALOG2");
    printf("%ls\n", e.get_inner_xml().c_str());
    system("pause");
    return 0;
}