repoman: check metadata.xml xml decl, bug #328113 (tag: v2.2.0_alpha165)
author Zac Medico <zmedico@gentoo.org>
Sun, 3 Mar 2013 17:59:21 +0000 (09:59 -0800)
committer Zac Medico <zmedico@gentoo.org>
Sun, 3 Mar 2013 17:59:21 +0000 (09:59 -0800)
bin/repoman

index 5618cf198ed6179771d072f528861b6f944533cc..8c49c06e5d22261a134e08a60f5775f67420967a 100755 (executable)
@@ -508,6 +508,9 @@ suspect_virtual = {
        "dev-libs/libusb-compat":"virtual/libusb",
 }
 
+metadata_xml_encoding = 'UTF-8'
+metadata_xml_declaration = '<?xml version="1.0" encoding="%s"?>' % \
+       (metadata_xml_encoding,)
 metadata_doctype_name = 'pkgmetadata'
 metadata_dtd_uri = 'http://www.gentoo.org/dtd/metadata.dtd'
 # force refetch if the local copy creation time is older than this
@@ -1274,17 +1277,38 @@ for k, v in repoman_settings.thirdpartymirrors().items():
                        v += "/"
                thirdpartymirrors[v] = k
 
+class _XMLParser(xml.etree.ElementTree.XMLParser):
+
+       def __init__(self, data, **kwargs):
+               xml.etree.ElementTree.XMLParser.__init__(self, **kwargs)
+               self._portage_data = data
+               if hasattr(self, 'parser'):
+                       self._base_XmlDeclHandler = self.parser.XmlDeclHandler
+                       self.parser.XmlDeclHandler = self._portage_XmlDeclHandler
+                       self._base_StartDoctypeDeclHandler = \
+                               self.parser.StartDoctypeDeclHandler
+                       self.parser.StartDoctypeDeclHandler = \
+                               self._portage_StartDoctypeDeclHandler
+
+       def _portage_XmlDeclHandler(self, version, encoding, standalone):
+               if self._base_XmlDeclHandler is not None:
+                       self._base_XmlDeclHandler(version, encoding, standalone)
+               self._portage_data["XML_DECLARATION"] = (version, encoding, standalone)
+
+       def _portage_StartDoctypeDeclHandler(self, doctypeName, systemId, publicId,
+               has_internal_subset):
+               if self._base_StartDoctypeDeclHandler is not None:
+                       self._base_StartDoctypeDeclHandler(doctypeName, systemId, publicId,
+                               has_internal_subset)
+               self._portage_data["DOCTYPE"] = (doctypeName, systemId, publicId)
+
 class _MetadataTreeBuilder(xml.etree.ElementTree.TreeBuilder):
        """
        Implements doctype() as required to avoid deprecation warnings with
        >=python-2.7.
        """
-       def __init__(self, data):
-               xml.etree.ElementTree.TreeBuilder.__init__(self)
-               self._portage_data = data
-
        def doctype(self, name, pubid, system):
-               self._portage_data["DOCTYPE"] = (name, pubid, system)
+               pass
 
 try:
        herd_base = make_herd_base(os.path.join(repoman_settings["PORTDIR"], "metadata/herds.xml"))
@@ -1644,43 +1668,68 @@ for x in effective_scanlist:
        else:
                metadata_bad = False
                xml_info = {}
+               xml_parser = _XMLParser(xml_info, target=_MetadataTreeBuilder())
 
                # read metadata.xml into memory
                try:
                        _metadata_xml = xml.etree.ElementTree.parse(
                                _unicode_encode(os.path.join(checkdir, "metadata.xml"),
                                encoding=_encodings['fs'], errors='strict'),
-                               parser=xml.etree.ElementTree.XMLParser(
-                                       target=_MetadataTreeBuilder(xml_info)))
+                               parser=xml_parser)
                except (ExpatError, SyntaxError, EnvironmentError) as e:
                        metadata_bad = True
                        stats["metadata.bad"] += 1
                        fails["metadata.bad"].append("%s/metadata.xml: %s" % (x, e))
                        del e
                else:
-                       if sys.hexversion < 0x2070000 or \
+                       if not hasattr(xml_parser, 'parser') or \
+                               sys.hexversion < 0x2070000 or \
                                (sys.hexversion > 0x3000000 and sys.hexversion < 0x3020000):
                                # doctype is not parsed with python 2.6 or 3.1
                                pass
-                       elif "DOCTYPE" not in xml_info:
-                               metadata_bad = True
-                               stats["metadata.bad"] += 1
-                               fails["metadata.bad"].append("%s/metadata.xml: %s" % (x,
-                                       "DOCTYPE is missing"))
                        else:
-                               doctype_name, doctype_pubid, doctype_system = \
-                                       xml_info["DOCTYPE"]
-                               if doctype_system != metadata_dtd_uri:
+                               if "XML_DECLARATION" not in xml_info:
                                        stats["metadata.bad"] += 1
                                        fails["metadata.bad"].append("%s/metadata.xml: "
-                                               "DOCTYPE: SYSTEM should refer to '%s', not '%s'" %
-                                               (x, metadata_dtd_uri, doctype_system))
+                                               "xml declaration is missing on first line, "
+                                               "should be '%s'" % (x, metadata_xml_declaration))
+                               else:
+                                       xml_version, xml_encoding, xml_standalone = \
+                                               xml_info["XML_DECLARATION"]
+                                       if xml_encoding is None or \
+                                               xml_encoding.upper() != metadata_xml_encoding:
+                                               stats["metadata.bad"] += 1
+                                               if xml_encoding is None:
+                                                       encoding_problem = "but it is undefined"
+                                               else:
+                                                       encoding_problem = "not '%s'" % xml_encoding
+                                               fails["metadata.bad"].append("%s/metadata.xml: "
+                                                       "xml declaration encoding should be '%s', %s" %
+                                                       (x, metadata_xml_encoding, encoding_problem))
 
-                               if doctype_name != metadata_doctype_name:
+                               if "DOCTYPE" not in xml_info:
+                                       metadata_bad = True
                                        stats["metadata.bad"] += 1
-                                       fails["metadata.bad"].append("%s/metadata.xml: "
-                                               "DOCTYPE: name should be '%s', not '%s'" %
-                                               (x, metadata_doctype_name, doctype_name))
+                                       fails["metadata.bad"].append("%s/metadata.xml: %s" % (x,
+                                               "DOCTYPE is missing"))
+                               else:
+                                       doctype_name, doctype_system, doctype_pubid = \
+                                               xml_info["DOCTYPE"]
+                                       if doctype_system != metadata_dtd_uri:
+                                               stats["metadata.bad"] += 1
+                                               if doctype_system is None:
+                                                       system_problem = "but it is undefined"
+                                               else:
+                                                       system_problem = "not '%s'" % doctype_system
+                                               fails["metadata.bad"].append("%s/metadata.xml: "
+                                                       "DOCTYPE: SYSTEM should refer to '%s', %s" %
+                                                       (x, metadata_dtd_uri, system_problem))
+
+                                       if doctype_name != metadata_doctype_name:
+                                               stats["metadata.bad"] += 1
+                                               fails["metadata.bad"].append("%s/metadata.xml: "
+                                                       "DOCTYPE: name should be '%s', not '%s'" %
+                                                       (x, metadata_doctype_name, doctype_name))
 
                        # load USE flags from metadata.xml
                        try: