对XML中的tag, attribute进行统计 郝伟 2021/01/05
1. 实现代码
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
namespace XmlSchemaExtractor
{
public class Attribute : XmlNode
{
public string value;
public Attribute() { }
public Attribute(string name, string value) { this.name = name; this.value = value; }
}
public class TextNode : XmlNode
{
public string text;
}
public class XmlNode
{
public string name;
public XmlNode() { }
public XmlNode(string name) { this.name = name; }
/*
<?xml version="1.0" encoding="utf-8">
<cpe-list>
<cpe-item name="cpe:/a:10-strike:network_monitor:1.0">
<title xml:lang="en-US">10-Strike Network Monitor 1.0</title>
<references>
<reference href="https://www.10-strike.com/network-monitor/history.shtml">Version</reference>
<reference href="https://www.10-strike.com/">Vendor</reference>
</references>
<meta:item-metadata nvd-id="552101" status="FINAL" modification-date="2019-09-06T17:18:29.197Z"/>
</cpe-item>
</cpe-list>
*/
public static XmlNode ParseSchema(string xmlfile = @"F:\backups\official-cpe-dictionary_v2.2.xml")
{
// StreamReader sr = new StreamReader(xmlfile, Encoding.UTF8);
// xmlfile = "sample.xml";
var lines = File.ReadAllLines(xmlfile);
lines[0] = "";
Regex re_tag = new Regex("<([^<>]+)>");
Regex re_att = new Regex("\\s([0-9a-zA-Z:._-]+)=\"([0-9a-zA-Z:/._-]+)\"");
Dictionary<string, int> dic_tags = new Dictionary<string, int>();
foreach (var line in lines)
{
foreach (Match m0 in re_tag.Matches(line))
{
string part = m0.Groups[1].Value;
//Console.WriteLine("part = " + part);
// There are three cases:
// 1. tag name only, title
// 2. tag name with attributes, title xml:lang="en-US"
// 3. close tag, /title
if (part.StartsWith("/")) // case 3
{
}
else if (part.Contains(" ")) // case 2
{
foreach (Match m1 in re_att.Matches(part))
{
var key ="@" + m1.Groups[1].Value;
DictAddKey(dic_tags, key);
DictAddKey(dic_tags, part.Split(' ')[0]);
// Console.WriteLine("key={0}\nval={1}\n", m1.Groups[1], m1.Groups[2]);
}
}
else // case 1
{
DictAddKey(dic_tags, part);
}
}
}
foreach (var key in dic_tags.Keys.OrderBy(k => k))
{
Console.WriteLine(key + ": " + dic_tags[key]);
}
return null;
}
private static void DictAddKey(Dictionary<string, int> dic_tags, string key)
{
if (!dic_tags.ContainsKey(key))
dic_tags.Add(key, 0);
dic_tags[key] += 1;
}
static string[] exs = {
" <cpe-item",
" <title ",
" <references>",
" <reference ",
" </references>",
" <meta:",
" </cpe-item",
};
public static bool Contains(string str)
{
foreach (var ex in exs)
{
if (str.Contains(ex))
{
return true;
}
}
return false;
}
}
}
2. 统计结果
@deprecated: 22571
@deprecated_by: 15355
@deprecated-by-n: 1
@deprecated-by-nvd-id: 22568
@deprecation_date: 22569
@href: 744712
@modification-date: 579773
@name: 436348
@nvd-id: 579774
@status: 579773
@xml:lang: 585269
@xmlns: 1
@xmlns:config: 1
@xmlns:cpe-23: 1
@xmlns:meta: 1
@xmlns:ns6: 1
@xmlns:scap-core: 1
@xmlns:xsi: 1
cpe-item: 496843
cpe-list: 7
generator: 1
meta:item-metadata: 1761889
product_name: 1
product_version: 1
reference: 744712
references: 505300
root: 1
schema_version: 1
timestamp: 1
title: 585269
3. 其他
以下代码可以获得XML的schema
public void GetSchema()
{
string xmlfile = @"C:\temp\sample.xml";
xmlfile = @"F:\backups\official-cpe-dictionary_v2.2.xml";
XmlReader reader = XmlReader.Create(xmlfile);
XmlSchemaSet schemaSet = new XmlSchemaSet();
XmlSchemaInference inference = new XmlSchemaInference();
schemaSet = inference.InferSchema(reader);
// Display the inferred schema.
Console.WriteLine("Original schema:\n");
foreach (XmlSchema schema in schemaSet.Schemas())// ("http://www.contoso.com/items"))
{
schema.Write(Console.Out);
}
}
运行结果
<?xml version="1.0" encoding="gb2312"?>
<xs:schema xmlns:ns6="http://scap.nist.gov/schema/scap-core/0.1" xmlns:config="http://scap.nist.gov/schema/configuration/0.1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:scap-core="http://scap.nist.gov/schema/scap-core/0.3" xmlns:cpe-23="http://scap.nist.gov/schema/cpe-extension/2.3" xmlns:meta="http://scap.nist.gov/schema/cpe-dictionary-metadata/0.2" attributeFormDefault="unqualified" elementFormDefault="qualified" targetNamespace="http://cpe.mitre.org/dictionary/2.0" xmlns:xs="http://www.w3.org/2001/XMLSchema">
<xs:import namespace="http://www.w3.org/XML/1998/namespace" />
<xs:import namespace="http://scap.nist.gov/schema/cpe-dictionary-metadata/0.2" />
<xs:element name="cpe-list">
<xs:complexType>
<xs:sequence>
<xs:element name="generator">
<xs:complexType>
<xs:sequence>
<xs:element name="product_name" type="xs:string" />
<xs:element name="product_version" type="xs:decimal" />
<xs:element name="schema_version" type="xs:decimal" />
<xs:element name="timestamp" type="xs:dateTime" />
</xs:sequence>
</xs:complexType>
</xs:element>
<xs:element maxOccurs="unbounded" name="cpe-item">
<xs:complexType>
<xs:sequence>
<xs:element maxOccurs="unbounded" name="title">
<xs:complexType>
<xs:simpleContent>
<xs:extension base="xs:string">
<xs:attribute ref="xml:lang" use="required" />
</xs:extension>
</xs:simpleContent>
</xs:complexType>
</xs:element>
<xs:element minOccurs="0" name="references">
<xs:complexType>
<xs:sequence>
<xs:element maxOccurs="unbounded" name="reference">
<xs:complexType>
<xs:simpleContent>
<xs:extension base="xs:string">
<xs:attribute name="href" type="xs:string" use="required" />
</xs:extension>
</xs:simpleContent>
</xs:complexType>
</xs:element>
</xs:sequence>
</xs:complexType>
</xs:element>
<xs:element ref="meta:item-metadata" />
</xs:sequence>
<xs:attribute name="name" type="xs:string" use="required" />
<xs:attribute name="deprecated" type="xs:boolean" use="optional" />
<xs:attribute name="deprecated_by" type="xs:string" use="optional" />
<xs:attribute name="deprecation_date" type="xs:dateTime" use="optional" />
</xs:complexType>
</xs:element>
</xs:sequence>
</xs:complexType>
</xs:element>
</xs:schema><?xml version="1.0" encoding="gb2312"?>
<xs:schema xmlns:tns="http://scap.nist.gov/schema/cpe-dictionary-metadata/0.2" attributeFormDefault="unqualified" elementFormDefault="qualified" targetNamespace="http://scap.nist.gov/schema/cpe-dictionary-metadata/0.2" xmlns:xs="http://www.w3.org/2001/XMLSchema">
<xs:element name="item-metadata">
<xs:complexType>
<xs:attribute name="nvd-id" type="xs:unsignedInt" use="required" />
<xs:attribute name="status" type="xs:string" use="optional" />
<xs:attribute name="modification-date" type="xs:dateTime" use="optional" />
<xs:attribute name="deprecated-by-nvd-id" type="xs:unsignedInt" use="optional" />
<xs:attribute name="deprecated-by-n" type="xs:unsignedByte" use="optional" />
</xs:complexType>
</xs:element>
</xs:schema><?xml version="1.0" encoding="gb2312"?>
<xs:schema xmlns:xml="http://www.w3.org/XML/1998/namespace" targetNamespace="http://www.w3.org/XML/1998/namespace" xmlns:xs="http://www.w3.org/2001/XMLSchema">
<xs:attribute name="lang" type="xs:language" />
<xs:attribute name="base" type="xs:anyURI" />
<xs:attribute default="preserve" name="space">
<xs:simpleType>
<xs:restriction base="xs:NCName">
<xs:enumeration value="default" />
<xs:enumeration value="preserve" />
</xs:restriction>
</xs:simpleType>
</xs:attribute>
<xs:attributeGroup name="specialAttrs">
<xs:attribute ref="xml:lang" />
<xs:attribute ref="xml:space" />
<xs:attribute ref="xml:base" />
</xs:attributeGroup>