对XML中的tag, attribute进行统计 郝伟 2021/01/05

1. 实现代码

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

namespace XmlSchemaExtractor
{
    /// <summary>
    /// A named node that additionally carries a string value.
    /// </summary>
    public class Attribute : XmlNode
    {
        /// <summary>Value stored alongside the inherited <c>name</c>.</summary>
        public string value;

        /// <summary>Creates an instance with both <c>name</c> and <c>value</c> left unset.</summary>
        public Attribute()
            : base()
        {
        }

        /// <summary>Creates an instance with the given name and value.</summary>
        /// <param name="name">Stored in the inherited <c>name</c> field.</param>
        /// <param name="value">Stored in <see cref="value"/>.</param>
        public Attribute(string name, string value)
            : base(name)
        {
            this.value = value;
        }
    }

    /// <summary>
    /// A node that adds a <c>text</c> field to the inherited <c>name</c>.
    /// </summary>
    public class TextNode : XmlNode
    {
        /// <summary>Text held by this node; nothing in this class assigns it.</summary>
        public string text;

        /// <summary>Creates an instance with <c>name</c> and <c>text</c> left unset.</summary>
        public TextNode()
            : base()
        {
        }
    }

    /// <summary>
    /// Base node type of this file's XML model. Also hosts the line-based
    /// tag/attribute counter (<see cref="ParseSchema"/>) and a small
    /// substring filter (<see cref="Contains"/>).
    /// </summary>
    public class XmlNode
    {
        /// <summary>Node name.</summary>
        public string name;

        // Captures everything between one '<' and the next '>' on a single line,
        // e.g. "title xml:lang=\"en-US\"", "/title" or "?xml version=\"1.0\"?".
        private static readonly Regex TagPattern = new Regex("<([^<>]+)>");

        // Captures name="value" pairs that are preceded by whitespace. The value may be
        // any run of characters other than a double quote (including the empty string),
        // so attributes whose values contain '%', '~', spaces, '?' etc. are counted too.
        private static readonly Regex AttributePattern = new Regex("\\s([0-9a-zA-Z:._-]+)=\"([^\"]*)\"");

        /// <summary>Creates a node with <c>name</c> left unset.</summary>
        public XmlNode() { }

        /// <summary>Creates a node with the given name.</summary>
        /// <param name="name">Stored in <see cref="name"/>.</param>
        public XmlNode(string name) { this.name = name; }

        /*
        Sample of the input this counter was written for:

        <?xml version="1.0" encoding="utf-8"?>
        <cpe-list>
          <cpe-item name="cpe:/a:10-strike:network_monitor:1.0">
            <title xml:lang="en-US">10-Strike Network Monitor 1.0</title>
            <references>
              <reference href="https://www.10-strike.com/network-monitor/history.shtml">Version</reference>
              <reference href="https://www.10-strike.com/">Vendor</reference>
            </references>
            <meta:item-metadata nvd-id="552101" status="FINAL" modification-date="2019-09-06T17:18:29.197Z"/>
          </cpe-item>
        </cpe-list>
        */

        /// <summary>
        /// Reads <paramref name="xmlfile"/> line by line, counts how often each element
        /// name and each attribute name occurs, and prints the counts to the console as
        /// "key: count" lines sorted by key. Attribute keys are prefixed with '@'.
        /// </summary>
        /// <remarks>
        /// This is a regex scan, not an XML parser: a tag is only recognised when its
        /// '&lt;' and '&gt;' are on the same line. Closing tags, declarations and
        /// processing instructions ("&lt;?...") and comments/DOCTYPE ("&lt;!...") are
        /// not counted. Exceptions raised while opening or reading the file are not caught.
        /// </remarks>
        /// <param name="xmlfile">Path of the XML file to scan.</param>
        /// <returns>Always <c>null</c>; the result is only written to the console.</returns>
        public static XmlNode ParseSchema(string xmlfile = @"F:\backups\official-cpe-dictionary_v2.2.xml")
        {
            // File.ReadLines streams the file, so a very large dictionary is never held
            // in memory as a whole, and an empty file simply yields no lines.
            Dictionary<string, int> counts = CountTagsAndAttributes(File.ReadLines(xmlfile));

            foreach (var key in counts.Keys.OrderBy(k => k))
            {
                Console.WriteLine(key + ": " + counts[key]);
            }
            return null;
        }

        // Counts element names (once per opening or self-closing tag) and '@'-prefixed
        // attribute names (once per occurrence) over the given lines.
        private static Dictionary<string, int> CountTagsAndAttributes(IEnumerable<string> lines)
        {
            var counts = new Dictionary<string, int>();

            foreach (var line in lines)
            {
                foreach (Match tagMatch in TagPattern.Matches(line))
                {
                    string part = tagMatch.Groups[1].Value.Trim();

                    // Skip closing tags (/title), the XML declaration and other processing
                    // instructions (?xml ...), and comments / DOCTYPE (!-- ... , !DOCTYPE ...).
                    if (part.Length == 0 || part[0] == '/' || part[0] == '?' || part[0] == '!')
                    {
                        continue;
                    }

                    // The element name is the leading run of non-whitespace characters;
                    // a self-closing tag without attributes ("br/") carries a trailing slash.
                    int nameEnd = 0;
                    while (nameEnd < part.Length && !char.IsWhiteSpace(part[nameEnd]))
                    {
                        nameEnd++;
                    }
                    string tagName = part.Substring(0, nameEnd).TrimEnd('/');
                    if (tagName.Length == 0)
                    {
                        continue;
                    }

                    // Count the element exactly once, independent of how many attributes it has.
                    DictAddKey(counts, tagName);

                    foreach (Match attributeMatch in AttributePattern.Matches(part))
                    {
                        DictAddKey(counts, "@" + attributeMatch.Groups[1].Value);
                    }
                }
            }

            return counts;
        }

        // Increments the counter stored under key, starting from zero for a new key.
        private static void DictAddKey(Dictionary<string, int> dic_tags, string key)
        {
            int current;
            dic_tags.TryGetValue(key, out current); // leaves current at 0 when the key is absent
            dic_tags[key] = current + 1;
        }

        // Line prefixes (including their leading indentation) recognised by Contains.
        static readonly string[] exs = {
                "  <cpe-item",
                "    <title ",
                "    <references>",
                "      <reference ",
                "    </references>",
                "    <meta:",
                "  </cpe-item",
            };

        /// <summary>
        /// Tells whether <paramref name="str"/> contains any of the fixed fragments in <c>exs</c>.
        /// </summary>
        /// <param name="str">Text to inspect; must not be null.</param>
        /// <returns><c>true</c> if at least one fragment occurs in <paramref name="str"/>.</returns>
        public static bool Contains(string str)
        {
            foreach (var ex in exs)
            {
                if (str.Contains(ex))
                {
                    return true;
                }
            }
            return false;
        }
    }
}

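2. 统计结果

说明：以下数据是用“每匹配到一个属性就把元素计数加一”的代码版本统计得到的，因此带属性的元素的计数等于其各属性计数之和，并不是元素的实际出现次数。例如 cpe-item = @name + @deprecated + @deprecated_by + @deprecation_date = 436348 + 22571 + 15355 + 22569 = 496843。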

@deprecated: 22571
@deprecated_by: 15355
@deprecated-by-n: 1
@deprecated-by-nvd-id: 22568
@deprecation_date: 22569
@href: 744712
@modification-date: 579773
@name: 436348
@nvd-id: 579774
@status: 579773
@xml:lang: 585269
@xmlns: 1
@xmlns:config: 1
@xmlns:cpe-23: 1
@xmlns:meta: 1
@xmlns:ns6: 1
@xmlns:scap-core: 1
@xmlns:xsi: 1
cpe-item: 496843
cpe-list: 7
generator: 1
meta:item-metadata: 1761889
product_name: 1
product_version: 1
reference: 744712
references: 505300
root: 1
schema_version: 1
timestamp: 1
title: 585269

3. 其他

以下代码可以获得XML的schema

/// <summary>
/// Infers an XSD schema from the given XML file with <c>XmlSchemaInference</c>
/// and writes every inferred schema document to the console.
/// </summary>
/// <remarks>
/// Exceptions raised while opening, reading or inferring from the file are not caught.
/// </remarks>
/// <param name="xmlfile">
/// Path of the XML file to analyse. Defaults to the path the parameterless
/// version of this method used.
/// </param>
public void GetSchema(string xmlfile = @"F:\backups\official-cpe-dictionary_v2.2.xml")
{
    XmlSchemaInference inference = new XmlSchemaInference();
    XmlSchemaSet schemaSet;

    // Dispose the reader (and the file handle behind it) as soon as inference is done.
    using (XmlReader reader = XmlReader.Create(xmlfile))
    {
        schemaSet = inference.InferSchema(reader);
    }

    // Display the inferred schema.
    Console.WriteLine("Original schema:\n");
    foreach (XmlSchema schema in schemaSet.Schemas())
    {
        schema.Write(Console.Out);
        // Write does not end with a line break; without this the next schema's
        // XML declaration is glued to the previous closing </xs:schema> tag.
        Console.WriteLine();
    }
}

运行结果

<?xml version="1.0" encoding="gb2312"?>
<xs:schema xmlns:ns6="http://scap.nist.gov/schema/scap-core/0.1" xmlns:config="http://scap.nist.gov/schema/configuration/0.1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:scap-core="http://scap.nist.gov/schema/scap-core/0.3" xmlns:cpe-23="http://scap.nist.gov/schema/cpe-extension/2.3" xmlns:meta="http://scap.nist.gov/schema/cpe-dictionary-metadata/0.2" attributeFormDefault="unqualified" elementFormDefault="qualified" targetNamespace="http://cpe.mitre.org/dictionary/2.0" xmlns:xs="http://www.w3.org/2001/XMLSchema">
  <xs:import namespace="http://www.w3.org/XML/1998/namespace" />
  <xs:import namespace="http://scap.nist.gov/schema/cpe-dictionary-metadata/0.2" />
  <xs:element name="cpe-list">
    <xs:complexType>
      <xs:sequence>
        <xs:element name="generator">
          <xs:complexType>
            <xs:sequence>
              <xs:element name="product_name" type="xs:string" />
              <xs:element name="product_version" type="xs:decimal" />
              <xs:element name="schema_version" type="xs:decimal" />
              <xs:element name="timestamp" type="xs:dateTime" />
            </xs:sequence>
          </xs:complexType>
        </xs:element>
        <xs:element maxOccurs="unbounded" name="cpe-item">
          <xs:complexType>
            <xs:sequence>
              <xs:element maxOccurs="unbounded" name="title">
                <xs:complexType>
                  <xs:simpleContent>
                    <xs:extension base="xs:string">
                      <xs:attribute ref="xml:lang" use="required" />
                    </xs:extension>
                  </xs:simpleContent>
                </xs:complexType>
              </xs:element>
              <xs:element minOccurs="0" name="references">
                <xs:complexType>
                  <xs:sequence>
                    <xs:element maxOccurs="unbounded" name="reference">
                      <xs:complexType>
                        <xs:simpleContent>
                          <xs:extension base="xs:string">
                            <xs:attribute name="href" type="xs:string" use="required" />
                          </xs:extension>
                        </xs:simpleContent>
                      </xs:complexType>
                    </xs:element>
                  </xs:sequence>
                </xs:complexType>
              </xs:element>
              <xs:element ref="meta:item-metadata" />
            </xs:sequence>
            <xs:attribute name="name" type="xs:string" use="required" />
            <xs:attribute name="deprecated" type="xs:boolean" use="optional" />
            <xs:attribute name="deprecated_by" type="xs:string" use="optional" />
            <xs:attribute name="deprecation_date" type="xs:dateTime" use="optional" />
          </xs:complexType>
        </xs:element>
      </xs:sequence>
    </xs:complexType>
  </xs:element>
</xs:schema><?xml version="1.0" encoding="gb2312"?>
<xs:schema xmlns:tns="http://scap.nist.gov/schema/cpe-dictionary-metadata/0.2" attributeFormDefault="unqualified" elementFormDefault="qualified" targetNamespace="http://scap.nist.gov/schema/cpe-dictionary-metadata/0.2" xmlns:xs="http://www.w3.org/2001/XMLSchema">
  <xs:element name="item-metadata">
    <xs:complexType>
      <xs:attribute name="nvd-id" type="xs:unsignedInt" use="required" />
      <xs:attribute name="status" type="xs:string" use="optional" />
      <xs:attribute name="modification-date" type="xs:dateTime" use="optional" />
      <xs:attribute name="deprecated-by-nvd-id" type="xs:unsignedInt" use="optional" />
      <xs:attribute name="deprecated-by-n" type="xs:unsignedByte" use="optional" />
    </xs:complexType>
  </xs:element>
</xs:schema><?xml version="1.0" encoding="gb2312"?>
<xs:schema xmlns:xml="http://www.w3.org/XML/1998/namespace" targetNamespace="http://www.w3.org/XML/1998/namespace" xmlns:xs="http://www.w3.org/2001/XMLSchema">
  <xs:attribute name="lang" type="xs:language" />
  <xs:attribute name="base" type="xs:anyURI" />
  <xs:attribute default="preserve" name="space">
    <xs:simpleType>
      <xs:restriction base="xs:NCName">
        <xs:enumeration value="default" />
        <xs:enumeration value="preserve" />
      </xs:restriction>
    </xs:simpleType>
  </xs:attribute>
  <xs:attributeGroup name="specialAttrs">
    <xs:attribute ref="xml:lang" />
    <xs:attribute ref="xml:space" />
    <xs:attribute ref="xml:base" />
  </xs:attributeGroup>
</xs:schema>