使用C#对大体积XML文件进行格式化的算法与实现
郝伟 2021/02/01
本文章提供了一个函数,通过流和缓冲,实现任意大小的XML文件的格式化。
格式化好的XML文件满足以下几个条件:
1. 每行至多只有一对标签(例如 <text>hello</text>),或者只有一个单独的开始标签或结束标签;
2. 每一级缩进用一个制表符(\t)表示;
3. 文件必须是 UTF-8 编码,如果不是请先转码。
格式化前的示例输入:
<?xml version=1.0><root> <note> <to>George</to> <from>John</from> <heading>Reminder</heading> <body>Don't forget the meeting!</body> </note> <note> <to>Tom</to> <from>Micle</from> <heading>Reminder</heading> <body>Don't forget the meeting! how are you</body> <remark pt='noll'/> </note> </root>
<?xml version=1.0> <root> <note> <to>George</to> <from>John</from> <heading>Reminder</heading> <body>Don't forget the meeting!</body> </note> <note> <to>Tom</to> <from>Micle</from> <heading>Reminder</heading> <body>Don't forget the meeting! how are you</body> <remark pt="noll"/> </note> </root>
/// <summary>
/// Manual smoke test: writes a small sample document to "sample.xml", runs
/// <c>XmlHelper.LargeXmlFormatter</c> on it, and then starts explorer.exe
/// with the formatted output file selected.
/// </summary>
public static void XmlFormatTest()
{
    const string sourcePath = "sample.xml";
    string formattedPath = sourcePath + "_out.xml";
    string sampleXml = @"<?xml version=1.0><root> <note> <to>George</to> <from>John</from> <heading>Reminder</heading> <body>Don't forget the meeting!</body> </note> <note> <to>Tom</to> <from>Micle</from> <heading>Reminder</heading> <body>Don't forget the meeting! how are you</body> <remark pt='noll'/> </note> </root>";

    File.WriteAllText(sourcePath, sampleXml);

    // The formatter's return value carries no information, so it is not stored.
    XmlHelper.LargeXmlFormatter(sourcePath, formattedPath);

    // "/e,/select,<path>" asks Explorer to open the folder with the file highlighted.
    string explorerArguments = "/e,/select," + Path.GetFullPath(formattedPath);
    Process.Start("explorer.exe", explorerArguments);
}
/// <summary>
/// Formats an XML file of any size by streaming it through a fixed-size buffer
/// and writing an indented copy. Formatting rules:
/// 1. Each output line holds at most one pair of tags: either an opening/closing
///    pair with its text, or a single opening, closing or self-closing tag.
/// 2. Every nesting level is indented with one tab character (\t).
/// 3. The input must be UTF-8 encoded; convert it first if it is not.
/// </summary>
/// <remarks>
/// This is a lightweight character scanner, not a validating XML parser:
/// carriage returns are dropped, line feeds inside text are replaced by spaces,
/// text is trimmed, and text that follows the last tag is not written.
/// A '&lt;' or '&gt;' inside a comment, CDATA section or attribute value is
/// still interpreted as markup.
/// </remarks>
/// <param name="xmlfile">Path of the XML file to format.</param>
/// <param name="outfile">Path of the formatted file; it is created or overwritten.</param>
/// <returns>Always an empty string; the result is written to <paramref name="outfile"/>.</returns>
/// <exception cref="FileNotFoundException">Thrown when <paramref name="xmlfile"/> does not exist.</exception>
public static string LargeXmlFormatter(string xmlfile, string outfile)
{
    if (!File.Exists(xmlfile))
    {
        throw new FileNotFoundException($"文件 `{xmlfile}` 读取失败。", xmlfile);
    }

    // Scanner states: before the first tag, inside "<...>", and in the text after a tag.
    const int BeforeFirstTag = 0;
    const int InTag = 1;
    const int InText = 2;

    // Number of characters read per block (1M). The result does not depend on this size.
    const int BlockSize = 1024 * 1024;

    int state = BeforeFirstTag;
    bool firstTag = true;      // suppresses the line break in front of the very first tag
    bool textPending = false;  // true when text was written right before the current tag

    StringBuilder tagBuffer = new StringBuilder();   // characters of the tag being read
    StringBuilder textBuffer = new StringBuilder();  // characters of the text being read
    StringBuilder prefix = new StringBuilder();      // one '\t' per open element
    char[] buffer = new char[BlockSize];

    // Open the input read-only so that write-protected files can be formatted too,
    // and let the using blocks release both files even when an exception is thrown.
    using (StreamReader sr = new StreamReader(
        new FileStream(xmlfile, FileMode.Open, FileAccess.Read, FileShare.Read), Encoding.UTF8))
    using (StreamWriter sw = new StreamWriter(outfile, false, Encoding.UTF8))
    {
        int len;
        while ((len = sr.ReadBlock(buffer, 0, buffer.Length)) > 0)
        {
            for (int i = 0; i < len; i++)
            {
                char ch = buffer[i];
                switch (ch)
                {
                    case '<':
                        // Flush the text collected since the previous tag.
                        string text = textBuffer.Replace('\n', ' ').ToString().Trim();
                        textBuffer.Clear();
                        textPending = text.Length > 0;
                        if (textPending)
                        {
                            // Text stays on the line of its opening tag.
                            sw.Write(text);
                        }
                        else if (!firstTag)
                        {
                            sw.Write('\n');
                        }
                        firstTag = false;
                        state = InTag;
                        tagBuffer.Clear();
                        tagBuffer.Append('<');
                        break;

                    case '>':
                        if (state != InTag)
                        {
                            // A literal '>' is legal in character data: keep it as text
                            // instead of treating it as the end of a tag.
                            if (state == InText)
                            {
                                textBuffer.Append(ch);
                            }
                            break;
                        }

                        state = InText;
                        tagBuffer.Append('>');

                        // tagBuffer holds at least "<" plus the '>' just added, so index 1 is valid.
                        char kind = tagBuffer[1];
                        if (kind == '/')
                        {
                            // Closing tag: step one level out, then indent only when the
                            // tag starts its own line (no text in front of it).
                            if (prefix.Length > 0)
                            {
                                prefix.Remove(0, 1);
                            }
                            if (!textPending)
                            {
                                sw.Write(prefix.ToString());
                            }
                            sw.Write(tagBuffer.ToString());
                        }
                        else
                        {
                            // Any other tag that follows text starts on a new line.
                            if (textPending)
                            {
                                sw.Write('\n');
                            }

                            if (kind == '?')
                            {
                                // Processing instruction / XML declaration: written without indentation.
                                sw.Write(tagBuffer.ToString());
                            }
                            else
                            {
                                sw.Write(prefix.ToString());
                                sw.Write(tagBuffer.ToString());

                                // Only a real opening element deepens the indentation. Comments,
                                // DOCTYPE and CDATA ("<!...") have no closing tag, and a
                                // self-closing element ("<x/>") is already complete.
                                bool selfClosing = tagBuffer[tagBuffer.Length - 2] == '/';
                                if (kind != '!' && !selfClosing)
                                {
                                    prefix.Append('\t');
                                }
                            }
                        }

                        // Reset per-tag state so that text is never written twice.
                        textPending = false;
                        textBuffer.Clear();
                        tagBuffer.Clear();
                        break;

                    case '\r':
                        break; // carriage returns are dropped

                    default:
                        if (state == InTag)
                        {
                            tagBuffer.Append(ch);
                        }
                        else if (state == InText)
                        {
                            textBuffer.Append(ch);
                        }
                        break;
                }
            }
        }
    }

    return "";
}
}