使用C#对大体积XML文件进行格式化的算法与实现
郝伟 2021/02/01

1 功能简介

本文章提供了一个函数,通过流和缓冲,实现任意大小的XML文件的格式化。

格式化好的XML文件满足以下几个条件:

  1. 每行只有至多一对标签或者一个关闭的标签;
    <text>hello</text>
  2. 每一级缩进使用1个制表符(\t)表示;
  3. 输入文件必须是UTF8编码,如果不是请先转码。

2 输入数据

<?xml version=1.0><root>	<note>		<to>George</to>
		<from>John</from>
		<heading>Reminder</heading>
		<body>Don't forget the meeting!</body>
    </note>	<note>		<to>Tom</to>
		<from>Micle</from>		<heading>Reminder</heading>
		<body>Don't forget the meeting! how are you</body>
		<remark pt='noll'/>
    </note>
</root>

3 输出结果

<?xml version=1.0>
<root>
	<note>
		<to>George</to>
		<from>John</from>
		<heading>Reminder</heading>
		<body>Don't forget the meeting!</body>
	</note>
	<note>
		<to>Tom</to>
		<from>Micle</from>
		<heading>Reminder</heading>
		<body>Don't forget the meeting! how are you</body>
		<remark pt='noll'/>
	</note>
</root>

4 测试代码

		/// <summary>
		/// Manual smoke test for the formatter: writes a small, badly laid out XML
		/// sample to <c>sample.xml</c> in the current directory, formats it into
		/// <c>sample.xml_out.xml</c>, then opens Windows Explorer with the result selected.
		/// The output is not verified programmatically; it is meant to be inspected by eye.
		/// </summary>
		public static void XmlFormatTest()
		{
			// The literal is kept verbatim: its irregular tabs, spaces and line breaks
			// are exactly what the formatter is expected to normalize.
			const string sampleXml = @"<?xml version=1.0><root>	<note>		<to>George</to>
		<from>John</from>
		<heading>Reminder</heading>
		<body>Don't forget the meeting!</body>
    </note>	<note>		<to>Tom</to>
		<from>Micle</from>		<heading>Reminder</heading>
		<body>Don't forget the meeting! how are you</body>
		<remark pt='noll'/>
    </note>
</root>";

			string inputPath = "sample.xml";
			string outputPath = inputPath + "_out.xml";

			File.WriteAllText(inputPath, sampleXml);
			XmlHelper.LargeXmlFormatter(inputPath, outputPath);

			// Reveal the formatted file in Explorer (Windows only).
			string explorerArgs = "/e,/select," + Path.GetFullPath(outputPath);
			Process.Start("explorer.exe", explorerArgs);
		}

5 源代码

		/// <summary>
		/// Re-indents an XML file of arbitrary size by streaming it through a fixed-size
		/// character buffer, so the whole document is never held in memory.
		/// Formatting rules:
		/// 1. Each output line holds at most one tag pair (<c>&lt;a&gt;text&lt;/a&gt;</c>),
		///    one opening tag, or one closing tag.
		/// 2. Each nesting level is indented with one tab character.
		/// 3. The input is decoded as UTF-8; convert other encodings beforehand.
		/// </summary>
		/// <remarks>
		/// This is a character scanner, not an XML parser. Every <c>&lt;</c> starts a tag and,
		/// while inside a tag, the next <c>&gt;</c> ends it. A <c>&gt;</c> inside an attribute
		/// value, or <c>&lt;</c>/<c>&gt;</c> inside comments or CDATA sections, is therefore not
		/// handled. Line breaks inside text are replaced by spaces and the text is trimmed.
		/// Text after the last tag is discarded and no trailing newline is written.
		/// </remarks>
		/// <param name="xmlfile">Path of the XML file to format.</param>
		/// <param name="outfile">Path of the formatted output file (overwritten if it exists).</param>
		/// <returns>Always an empty string; the result is written to <paramref name="outfile"/>.</returns>
		/// <exception cref="FileNotFoundException">Thrown when <paramref name="xmlfile"/> does not exist.</exception>
		public static string LargeXmlFormatter(string xmlfile, string outfile)
		{
			if (!File.Exists(xmlfile))
			{
				throw new FileNotFoundException($"文件 `{xmlfile}` 读取失败。", xmlfile);
			}

			// Number of characters read per chunk (1M chars).
			const int BufferSize = 1024 * 1024;

			// 0 = before the first tag, 1 = inside a tag, 2 = inside text content.
			int state = 0;
			// True until the first '<' has been seen; suppresses a leading blank line.
			bool firstTag = true;

			StringBuilder tagBuffer = new StringBuilder();   // tag currently being read
			StringBuilder textBuffer = new StringBuilder();  // text since the previous tag
			StringBuilder prefix = new StringBuilder();      // current indentation (tabs)
			StringBuilder outBuffer = new StringBuilder();   // formatted output of the current chunk
			char[] buffer = new char[BufferSize];

			// Open read-only so read-only input files work; 'using' closes both streams
			// even when an exception is thrown midway.
			using (StreamReader sr = new StreamReader(
				new FileStream(xmlfile, FileMode.Open, FileAccess.Read, FileShare.Read), Encoding.UTF8))
			using (StreamWriter sw = new StreamWriter(outfile, false, Encoding.UTF8))
			{
				int len;
				while ((len = sr.ReadBlock(buffer, 0, buffer.Length)) > 0)
				{
					for (int i = 0; i < len; i++)
					{
						char ch = buffer[i];
						switch (ch)
						{
							case '<':
								// Flush the text collected since the previous tag. It stays in
								// textBuffer so the '>' handler knows whether text preceded the tag.
								string text = textBuffer.Replace('\n', ' ').ToString().Trim();
								textBuffer.Clear();
								textBuffer.Append(text);
								if (text.Length > 0)
								{
									outBuffer.Append(text);
								}
								else if (!firstTag)
								{
									// No text before this tag: it starts a new output line.
									outBuffer.Append('\n');
								}
								firstTag = false;
								state = 1;
								tagBuffer.Clear();
								tagBuffer.Append('<');
								break;

							case '>':
								if (state != 1)
								{
									// A literal '>' outside a tag is ordinary text content.
									if (state == 2)
									{
										textBuffer.Append(ch);
									}
									break;
								}

								state = 2;
								tagBuffer.Append('>');
								string tag = tagBuffer.ToString();
								// Safe: in state 1 the buffer always starts with '<' and now ends with '>'.
								char marker = tag[1];

								if (marker == '?')
								{
									// Processing instruction / XML declaration: emitted without indentation.
									outBuffer.Append(tag);
								}
								else
								{
									// A closing tag belongs one level up.
									if (marker == '/' && prefix.Length > 0)
									{
										prefix.Remove(0, 1);
									}

									// Indent only when the tag starts its own line, i.e. no text precedes it.
									if (textBuffer.Length == 0)
									{
										outBuffer.Append(prefix.ToString());
									}
									outBuffer.Append(tag);

									// Only a real opening tag deepens the indentation; closing tags,
									// self-closing tags (<x/>) and <!...> declarations/comments do not.
									bool opensElement = marker != '/'
										&& marker != '!'
										&& tag[tag.Length - 2] != '/';
									if (opensElement)
									{
										prefix.Append('\t');
									}
								}

								// The preceding text has been consumed; clearing it here prevents it
								// from being emitted a second time with the following text.
								textBuffer.Clear();
								tagBuffer.Clear();
								break;

							case '\r':
								// Carriage returns are dropped.
								break;

							default:
								if (state == 1)
								{
									tagBuffer.Append(ch);
								}
								else if (state == 2)
								{
									textBuffer.Append(ch);
								}
								break;
						}
					}

					// Write out what this chunk produced; partial tags/text stay in their buffers.
					sw.Write(outBuffer.ToString());
					outBuffer.Clear();
				}
			}

			return "";
		}
	}