使用C#对大体积XML文件进行格式化的算法与实现 郝伟 2021/02/01 [TOC]

1. 1 功能简介

本文章提供了一个函数,通过流和缓冲,实现任意大小的XML文件的格式化。

格式化好的XML文件满足以下几个条件:

  1. 每行只有至多一对标签或者一个关闭的标签; 如 <text>hello</text>
  2. 每一级缩进使用 1 个制表符(\t)表示;
  3. 输入文件必须是 UTF-8 编码,如果不是请先转码。

2. 2 输入数据

<?xml version=1.0><root>    <note>        <to>George</to>
        <from>John</from>
        <heading>Reminder</heading>
        <body>Don't forget the meeting!</body>
    </note>    <note>        <to>Tom</to>
        <from>Micle</from>        <heading>Reminder</heading>
        <body>Don't forget the meeting! how are you</body>
        <remark pt='noll'/>
    </note>
</root>

3. 3 输出结果

<?xml version=1.0>
<root>
    <note>
        <to>George</to>
        <from>John</from>
        <heading>Reminder</heading>
        <body>Don't forget the meeting!</body>
    </note>
    <note>
        <to>Tom</to>
        <from>Micle</from>
        <heading>Reminder</heading>
        <body>Don't forget the meeting! how are you</body>
        <remark pt='noll'/>
    </note>
</root>

4. 4 测试代码

        /// <summary>
        /// Manual smoke test for the formatter: writes a small, unevenly laid-out XML
        /// sample to <c>sample.xml</c>, formats it into <c>sample.xml_out.xml</c>, and
        /// then launches Explorer with the formatted file selected.
        /// </summary>
        public static void XmlFormatTest()
        {
            const string sourcePath = "sample.xml";
            string formattedPath = sourcePath + "_out.xml";

            // Sample input: several tags share a line and the indentation is irregular.
            string sampleXml = @"<?xml version=1.0><root>    <note>        <to>George</to>
        <from>John</from>
        <heading>Reminder</heading>
        <body>Don't forget the meeting!</body>
    </note>    <note>        <to>Tom</to>
        <from>Micle</from>        <heading>Reminder</heading>
        <body>Don't forget the meeting! how are you</body>
        <remark pt='noll'/>
    </note>
</root>";
            File.WriteAllText(sourcePath, sampleXml);

            XmlHelper.LargeXmlFormatter(sourcePath, formattedPath);

            // Open Explorer with the formatted file pre-selected.
            string explorerArgs = "/e,/select," + Path.GetFullPath(formattedPath);
            Process.Start("explorer.exe", explorerArgs);
        }

5. 5 源代码


        /// <summary>
        /// Re-formats an XML file of arbitrary size by streaming it through a fixed-size
        /// character buffer, so the whole document is never held in memory.
        /// Formatting rules:
        /// 1. Each output line holds at most one tag pair (<c>&lt;a&gt;text&lt;/a&gt;</c>),
        ///    one opening tag, or one closing tag.
        /// 2. Each nesting level is indented with one tab character (<c>\t</c>).
        /// 3. The input is read as UTF-8; convert the file first if it uses another encoding.
        /// Line breaks inside text are replaced by spaces and the text is trimmed.
        /// Characters before the first tag and after the last tag are dropped.
        /// Declarations, processing instructions, comments, DOCTYPE and CDATA
        /// (<c>&lt;?...&gt;</c>, <c>&lt;!...&gt;</c>) are written on their own line and do not
        /// change the indentation level.
        /// Known limitation: a <c>&lt;</c> or <c>&gt;</c> inside a comment, CDATA section or
        /// attribute value is still interpreted as a tag delimiter.
        /// </summary>
        /// <param name="xmlfile">Path of the XML file to format.</param>
        /// <param name="outfile">Path of the formatted output file (overwritten if it exists).</param>
        /// <returns>Always an empty string; the result is written to <paramref name="outfile"/>.</returns>
        /// <exception cref="FileNotFoundException"><paramref name="xmlfile"/> does not exist.</exception>
        public static string LargeXmlFormatter(string xmlfile, string outfile)
        {
            if (!File.Exists(xmlfile))
            {
                throw new FileNotFoundException($"文件 `{xmlfile}` 读取失败。", xmlfile);
            }

            // Number of characters read per block (1M, as the original comment intended).
            const int chunkSize = 1024 * 1024;

            // 0 = before the first tag (characters are ignored), 1 = inside a tag, 2 = inside text.
            int state = 0;
            // False until the first tag starts, so no blank line is written at the top of the output.
            bool firstTagSeen = false;
            // True when non-empty text was written immediately before the tag being read.
            bool textPending = false;

            StringBuilder tagBuffer = new StringBuilder();   // characters of the tag being read
            StringBuilder textBuffer = new StringBuilder();  // text collected since the last tag
            StringBuilder prefix = new StringBuilder();      // one '\t' per open element

            // FileAccess.Read lets read-only inputs be formatted; the using blocks close
            // both files even when an exception is thrown mid-way.
            using (FileStream input = new FileStream(xmlfile, FileMode.Open, FileAccess.Read, FileShare.Read))
            using (StreamReader sr = new StreamReader(input, Encoding.UTF8))
            using (StreamWriter sw = new StreamWriter(outfile, false, Encoding.UTF8))
            {
                char[] buffer = new char[chunkSize];
                int len;
                while ((len = sr.ReadBlock(buffer, 0, buffer.Length)) > 0)
                {
                    for (int i = 0; i < len; i++)
                    {
                        char ch = buffer[i];
                        switch (ch)
                        {
                            case '<':
                                // Flush the text collected since the previous tag.
                                string text = textBuffer.Replace('\n', ' ').ToString().Trim();
                                // Clear right away so the text cannot be emitted a second time.
                                textBuffer.Clear();
                                textPending = text.Length > 0;
                                if (textPending)
                                {
                                    sw.Write(text);
                                }
                                else if (firstTagSeen)
                                {
                                    // Whitespace-only gap between two tags: start a new line.
                                    sw.Write('\n');
                                }
                                firstTagSeen = true;
                                state = 1;
                                tagBuffer.Clear();
                                tagBuffer.Append('<');
                                break;

                            case '>':
                                if (state != 1)
                                {
                                    // A '>' outside a tag is ordinary character data.
                                    if (state == 2)
                                    {
                                        textBuffer.Append(ch);
                                    }
                                    break;
                                }

                                state = 2;
                                tagBuffer.Append('>');
                                // In state 1 the buffer starts with '<', so index 1 always exists here.
                                switch (tagBuffer[1])
                                {
                                    case '/':
                                        // Closing tag: step back one indentation level.
                                        if (prefix.Length > 0)
                                        {
                                            prefix.Remove(0, 1);
                                        }
                                        // Directly after text the tag stays on the same line: <a>text</a>.
                                        if (!textPending)
                                        {
                                            sw.Write(prefix.ToString());
                                        }
                                        sw.Write(tagBuffer.ToString());
                                        break;

                                    case '?':
                                    case '!':
                                        // Declaration, processing instruction, comment, DOCTYPE or CDATA:
                                        // has no closing tag, so the indentation level is unchanged.
                                        if (textPending)
                                        {
                                            sw.Write('\n');
                                        }
                                        sw.Write(prefix.ToString());
                                        sw.Write(tagBuffer.ToString());
                                        break;

                                    default:
                                        // Opening or self-closing element tag. In mixed content the
                                        // preceding text is already on the line, so break first.
                                        if (textPending)
                                        {
                                            sw.Write('\n');
                                        }
                                        sw.Write(prefix.ToString());
                                        sw.Write(tagBuffer.ToString());

                                        // Only a real opening tag deepens the indentation;
                                        // a self-closing tag (<x/>) has no matching closing tag.
                                        if (tagBuffer[tagBuffer.Length - 2] != '/')
                                        {
                                            prefix.Append('\t');
                                        }
                                        break;
                                }
                                tagBuffer.Clear();
                                break;

                            case '\r':
                                // Carriage returns are dropped everywhere.
                                break;

                            default:
                                if (state == 1)
                                {
                                    tagBuffer.Append(ch);
                                }
                                else if (state == 2)
                                {
                                    textBuffer.Append(ch);
                                }
                                break;
                        }
                    }
                }
            }

            return "";
        }
    }

results matching ""

    No results matching ""