使用C#对大体积XML文件进行格式化的算法与实现 郝伟 2021/02/01 [TOC]
1. 功能简介
本文章提供了一个函数,通过流和缓冲,实现任意大小的XML文件的格式化。
格式化好的XML文件满足以下几个条件:
- 每行至多只有一对标签或者一个自关闭的标签,如 `<text>hello</text>` 或 `<remark pt='noll'/>`;
- 每一级缩进使用1个制表符 `\t` 表示;
- 必须是UTF8编码,如果不是请先转码。
2. 输入数据
<?xml version=1.0><root> <note> <to>George</to>
<from>John</from>
<heading>Reminder</heading>
<body>Don't forget the meeting!</body>
</note> <note> <to>Tom</to>
<from>Micle</from> <heading>Reminder</heading>
<body>Don't forget the meeting! how are you</body>
<remark pt='noll'/>
</note>
</root>
3. 输出结果
<?xml version=1.0>
<root>
	<note>
		<to>George</to>
		<from>John</from>
		<heading>Reminder</heading>
		<body>Don't forget the meeting!</body>
	</note>
	<note>
		<to>Tom</to>
		<from>Micle</from>
		<heading>Reminder</heading>
		<body>Don't forget the meeting! how are you</body>
		<remark pt='noll'/>
	</note>
</root>
4. 测试代码
/// <summary>
/// Manual smoke test: writes a small unformatted XML sample to "sample.xml",
/// runs the formatter on it, and then opens Windows Explorer with the
/// generated output file selected so the result can be inspected by hand.
/// </summary>
public static void XmlFormatTest()
{
    const string inputPath = "sample.xml";
    string outputPath = inputPath + "_out.xml";

    // Deliberately messy input: several tags per line and irregular line breaks.
    string sampleXml = @"<?xml version=1.0><root> <note> <to>George</to>
<from>John</from>
<heading>Reminder</heading>
<body>Don't forget the meeting!</body>
</note> <note> <to>Tom</to>
<from>Micle</from> <heading>Reminder</heading>
<body>Don't forget the meeting! how are you</body>
<remark pt='noll'/>
</note>
</root>";
    File.WriteAllText(inputPath, sampleXml);

    XmlHelper.LargeXmlFormatter(inputPath, outputPath);

    // Reveal the formatted file in Explorer (Windows only).
    string explorerArguments = "/e,/select," + Path.GetFullPath(outputPath);
    Process.Start("explorer.exe", explorerArguments);
}
5. 源代码
/// <summary>
/// Pretty-prints an XML file of arbitrary size. The input is streamed in
/// fixed-size character blocks, so the whole document is never held in memory.
/// Formatting rules:
/// 1. each output line holds at most one element (an open/close pair with its
///    text, a lone opening or closing tag, or a self-closing tag);
/// 2. every nesting level is indented with one tab character;
/// 3. the input must be UTF-8 encoded (convert it first if it is not).
/// </summary>
/// <remarks>
/// This is a lightweight scanner, not a validating parser: attribute values are
/// not inspected, line breaks inside text are replaced by spaces and the text is
/// trimmed, anything before the first '&lt;' or after the last '&gt;' is dropped,
/// and carriage returns are removed everywhere.
/// </remarks>
/// <param name="xmlfile">Path of the XML file to format.</param>
/// <param name="outfile">Path of the formatted file to create or overwrite (UTF-8).</param>
/// <returns>Always an empty string.</returns>
/// <exception cref="FileNotFoundException"><paramref name="xmlfile"/> does not exist.</exception>
public static string LargeXmlFormatter(string xmlfile, string outfile)
{
    if (!File.Exists(xmlfile))
    {
        throw new FileNotFoundException($"文件 `{xmlfile}` 读取失败。", xmlfile);
    }

    // Read 1M characters per block; parser state lives outside the read loop,
    // so tags and text may freely span block boundaries.
    const int bufferSize = 1024 * 1024;

    bool inTag = false;        // true between '<' and the '>' that terminates the markup
    bool seenTag = false;      // false until the first '<'; earlier characters are ignored
    bool textEmitted = false;  // true when text was written right before the current tag

    var tagBuffer = new StringBuilder();   // markup currently being collected
    var textBuffer = new StringBuilder();  // text collected since the previous tag
    var prefix = new StringBuilder();      // one '\t' per open element
    var buffer = new char[bufferSize];

    // 'using' guarantees both files are closed even when an exception escapes.
    // The input is opened read-only so read-only files can be formatted too.
    using (var sr = new StreamReader(new FileStream(xmlfile, FileMode.Open, FileAccess.Read, FileShare.Read), Encoding.UTF8))
    using (var sw = new StreamWriter(outfile, false, Encoding.UTF8))
    {
        int len;
        while ((len = sr.ReadBlock(buffer, 0, buffer.Length)) > 0)
        {
            for (int i = 0; i < len; i++)
            {
                char ch = buffer[i];
                if (ch == '\r')
                {
                    continue; // carriage returns are never copied to the output
                }

                if (inTag)
                {
                    if (ch != '>' || !IsMarkupTerminated(tagBuffer))
                    {
                        // Includes '<' and '>' inside comments / CDATA sections.
                        tagBuffer.Append(ch);
                        continue;
                    }

                    tagBuffer.Append('>');
                    inTag = false;

                    string tag = tagBuffer.ToString();
                    char kind = tag[1]; // safe: tag is at least "<>"
                    bool isClosing = kind == '/';
                    // Declarations, processing instructions, comments, CDATA and
                    // self-closing elements do not open a new nesting level.
                    bool opensLevel = !isClosing && kind != '?' && kind != '!'
                        && tag[tag.Length - 2] != '/';

                    if (isClosing && prefix.Length > 0)
                    {
                        prefix.Length -= 1;
                    }

                    // A tag that directly follows text stays on the same line.
                    if (!textEmitted)
                    {
                        sw.Write(prefix.ToString());
                    }
                    sw.Write(tag);

                    if (opensLevel)
                    {
                        prefix.Append('\t');
                    }
                }
                else if (ch == '<')
                {
                    // Flush the text gathered since the previous tag.
                    string text = textBuffer.Replace('\n', ' ').ToString().Trim();
                    textBuffer.Clear();
                    textEmitted = text.Length > 0;

                    if (textEmitted)
                    {
                        sw.Write(text);
                    }
                    else if (seenTag)
                    {
                        // No text in between: the new tag starts its own line.
                        // Skipped for the very first tag to avoid a leading blank line.
                        sw.Write('\n');
                    }

                    seenTag = true;
                    inTag = true;
                    tagBuffer.Clear();
                    tagBuffer.Append('<');
                }
                else if (seenTag)
                {
                    // Plain text, including a literal '>' outside of any tag.
                    textBuffer.Append(ch);
                }
            }
        }
    }

    return "";
}

/// <summary>
/// Tells whether a '>' arriving now terminates the markup collected in
/// <paramref name="tag"/>. Comments end only at "-->" and CDATA sections only
/// at "]]>"; any other markup ends at the first '>'.
/// </summary>
private static bool IsMarkupTerminated(StringBuilder tag)
{
    int length = tag.Length;
    if (BufferStartsWith(tag, "<!--"))
    {
        // Require at least "<!----" so the opening dashes are not reused as the closing ones.
        return length >= 6 && tag[length - 1] == '-' && tag[length - 2] == '-';
    }
    if (BufferStartsWith(tag, "<![CDATA["))
    {
        return tag[length - 1] == ']' && tag[length - 2] == ']';
    }
    return true;
}

/// <summary>
/// Ordinal prefix test on a <see cref="StringBuilder"/> without materializing a string.
/// </summary>
private static bool BufferStartsWith(StringBuilder buffer, string value)
{
    if (buffer.Length < value.Length)
    {
        return false;
    }
    for (int i = 0; i < value.Length; i++)
    {
        if (buffer[i] != value[i])
        {
            return false;
        }
    }
    return true;
}
}