对XML中的tag, attribute进行统计
郝伟 2021/01/05
using System; using System.Collections.Generic; using System.IO; using System.Linq; using System.Text; using System.Text.RegularExpressions; using System.Threading.Tasks; namespace XmlSchemaExtractor { public class Attribute : XmlNode { public string value; public Attribute() { } public Attribute(string name, string value) { this.name = name; this.value = value; } } public class TextNode : XmlNode { public string text; } public class XmlNode { public string name; public XmlNode() { } public XmlNode(string name) { this.name = name; } /* <?xml version="1.0" encoding="utf-8"> <cpe-list> <cpe-item name="cpe:/a:10-strike:network_monitor:1.0"> <title xml:lang="en-US">10-Strike Network Monitor 1.0</title> <references> <reference href="https://www.10-strike.com/network-monitor/history.shtml">Version</reference> <reference href="https://www.10-strike.com/">Vendor</reference> </references> <meta:item-metadata nvd-id="552101" status="FINAL" modification-date="2019-09-06T17:18:29.197Z"/> </cpe-item> </cpe-list> */ public static XmlNode ParseSchema(string xmlfile = @"F:\backups\official-cpe-dictionary_v2.2.xml") { // StreamReader sr = new StreamReader(xmlfile, Encoding.UTF8); // xmlfile = "sample.xml"; var lines = File.ReadAllLines(xmlfile); lines[0] = ""; Regex re_tag = new Regex("<([^<>]+)>"); Regex re_att = new Regex("\\s([0-9a-zA-Z:._-]+)=\"([0-9a-zA-Z:/._-]+)\""); Dictionary<string, int> dic_tags = new Dictionary<string, int>(); foreach (var line in lines) { foreach (Match m0 in re_tag.Matches(line)) { string part = m0.Groups[1].Value; //Console.WriteLine("part = " + part); // There are three cases: // 1. tag name only, title // 2. tag name with attributes, title xml:lang="en-US" // 3. close tag, /title if (part.StartsWith("/")) // case 3 { } else if (part.Contains(" ")) // case 2 { foreach (Match m1 in re_att.Matches(part)) { var key ="@" + m1.Groups[1].Value; DictAddKey(dic_tags, key); DictAddKey(dic_tags, part.Split(' ')[0]); // Console.WriteLine("key={0}\nval={1}\n", m1.Groups[1], m1.Groups[2]); } } else // case 1 { DictAddKey(dic_tags, part); } } } foreach (var key in dic_tags.Keys.OrderBy(k => k)) { Console.WriteLine(key + ": " + dic_tags[key]); } return null; } private static void DictAddKey(Dictionary<string, int> dic_tags, string key) { if (!dic_tags.ContainsKey(key)) dic_tags.Add(key, 0); dic_tags[key] += 1; } static string[] exs = { " <cpe-item", " <title ", " <references>", " <reference ", " </references>", " <meta:", " </cpe-item", }; public static bool Contains(string str) { foreach (var ex in exs) { if (str.Contains(ex)) { return true; } } return false; } } }
@deprecated: 22571
@deprecated_by: 15355
@deprecated-by-n: 1
@deprecated-by-nvd-id: 22568
@deprecation_date: 22569
@href: 744712
@modification-date: 579773
@name: 436348
@nvd-id: 579774
@status: 579773
@xml:lang: 585269
@xmlns: 1
@xmlns:config: 1
@xmlns:cpe-23: 1
@xmlns:meta: 1
@xmlns:ns6: 1
@xmlns:scap-core: 1
@xmlns:xsi: 1
cpe-item: 496843
cpe-list: 7
generator: 1
meta:item-metadata: 1761889
product_name: 1
product_version: 1
reference: 744712
references: 505300
root: 1
schema_version: 1
timestamp: 1
title: 585269
以下代码可以获得XML的schema
public void GetSchema() { string xmlfile = @"C:\temp\sample.xml"; xmlfile = @"F:\backups\official-cpe-dictionary_v2.2.xml"; XmlReader reader = XmlReader.Create(xmlfile); XmlSchemaSet schemaSet = new XmlSchemaSet(); XmlSchemaInference inference = new XmlSchemaInference(); schemaSet = inference.InferSchema(reader); // Display the inferred schema. Console.WriteLine("Original schema:\n"); foreach (XmlSchema schema in schemaSet.Schemas())// ("http://www.contoso.com/items")) { schema.Write(Console.Out); } }
运行结果
<?xml version="1.0" encoding="gb2312"?>
<xs:schema xmlns:ns6="http://scap.nist.gov/schema/scap-core/0.1" xmlns:config="http://scap.nist.gov/schema/configuration/0.1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:scap-core="http://scap.nist.gov/schema/scap-core/0.3" xmlns:cpe-23="http://scap.nist.gov/schema/cpe-extension/2.3" xmlns:meta="http://scap.nist.gov/schema/cpe-dictionary-metadata/0.2" attributeFormDefault="unqualified" elementFormDefault="qualified" targetNamespace="http://cpe.mitre.org/dictionary/2.0" xmlns:xs="http://www.w3.org/2001/XMLSchema">
<xs:import namespace="http://www.w3.org/XML/1998/namespace" />
<xs:import namespace="http://scap.nist.gov/schema/cpe-dictionary-metadata/0.2" />
<xs:element name="cpe-list">
<xs:complexType>
<xs:sequence>
<xs:element name="generator">
<xs:complexType>
<xs:sequence>
<xs:element name="product_name" type="xs:string" />
<xs:element name="product_version" type="xs:decimal" />
<xs:element name="schema_version" type="xs:decimal" />
<xs:element name="timestamp" type="xs:dateTime" />
</xs:sequence>
</xs:complexType>
</xs:element>
<xs:element maxOccurs="unbounded" name="cpe-item">
<xs:complexType>
<xs:sequence>
<xs:element maxOccurs="unbounded" name="title">
<xs:complexType>
<xs:simpleContent>
<xs:extension base="xs:string">
<xs:attribute ref="xml:lang" use="required" />
</xs:extension>
</xs:simpleContent>
</xs:complexType>
</xs:element>
<xs:element minOccurs="0" name="references">
<xs:complexType>
<xs:sequence>
<xs:element maxOccurs="unbounded" name="reference">
<xs:complexType>
<xs:simpleContent>
<xs:extension base="xs:string">
<xs:attribute name="href" type="xs:string" use="required" />
</xs:extension>
</xs:simpleContent>
</xs:complexType>
</xs:element>
</xs:sequence>
</xs:complexType>
</xs:element>
<xs:element ref="meta:item-metadata" />
</xs:sequence>
<xs:attribute name="name" type="xs:string" use="required" />
<xs:attribute name="deprecated" type="xs:boolean" use="optional" />
<xs:attribute name="deprecated_by" type="xs:string" use="optional" />
<xs:attribute name="deprecation_date" type="xs:dateTime" use="optional" />
</xs:complexType>
</xs:element>
</xs:sequence>
</xs:complexType>
</xs:element>
</xs:schema><?xml version="1.0" encoding="gb2312"?>
<xs:schema xmlns:tns="http://scap.nist.gov/schema/cpe-dictionary-metadata/0.2" attributeFormDefault="unqualified" elementFormDefault="qualified" targetNamespace="http://scap.nist.gov/schema/cpe-dictionary-metadata/0.2" xmlns:xs="http://www.w3.org/2001/XMLSchema">
<xs:element name="item-metadata">
<xs:complexType>
<xs:attribute name="nvd-id" type="xs:unsignedInt" use="required" />
<xs:attribute name="status" type="xs:string" use="optional" />
<xs:attribute name="modification-date" type="xs:dateTime" use="optional" />
<xs:attribute name="deprecated-by-nvd-id" type="xs:unsignedInt" use="optional" />
<xs:attribute name="deprecated-by-n" type="xs:unsignedByte" use="optional" />
</xs:complexType>
</xs:element>
</xs:schema><?xml version="1.0" encoding="gb2312"?>
<xs:schema xmlns:xml="http://www.w3.org/XML/1998/namespace" targetNamespace="http://www.w3.org/XML/1998/namespace" xmlns:xs="http://www.w3.org/2001/XMLSchema">
<xs:attribute name="lang" type="xs:language" />
<xs:attribute name="base" type="xs:anyURI" />
<xs:attribute default="preserve" name="space">
<xs:simpleType>
<xs:restriction base="xs:NCName">
<xs:enumeration value="default" />
<xs:enumeration value="preserve" />
</xs:restriction>
</xs:simpleType>
</xs:attribute>
<xs:attributeGroup name="specialAttrs">
<xs:attribute ref="xml:lang" />
<xs:attribute ref="xml:space" />
<xs:attribute ref="xml:base" />
</xs:attributeGroup>