[or-cvs] r18284: {torflow} Update BeautifulSoup to 3.0.7a. We're going to stay away fro (torflow/trunk/NetworkScanners/libs/BeautifulSoup)

mikeperry at seul.org mikeperry at seul.org
Wed Jan 28 04:17:46 UTC 2009


Author: mikeperry
Date: 2009-01-27 23:17:46 -0500 (Tue, 27 Jan 2009)
New Revision: 18284

Modified:
   torflow/trunk/NetworkScanners/libs/BeautifulSoup/BeautifulSoup.py
Log:

Update BeautifulSoup to 3.0.7a. We're going to stay away from
3.1 for now since it is worse at parsing bad html and has
some compatibility issues.



Modified: torflow/trunk/NetworkScanners/libs/BeautifulSoup/BeautifulSoup.py
===================================================================
--- torflow/trunk/NetworkScanners/libs/BeautifulSoup/BeautifulSoup.py	2009-01-28 00:03:47 UTC (rev 18283)
+++ torflow/trunk/NetworkScanners/libs/BeautifulSoup/BeautifulSoup.py	2009-01-28 04:17:46 UTC (rev 18284)
@@ -42,7 +42,7 @@
 
 Here, have some legalese:
 
-Copyright (c) 2004-2007, Leonard Richardson
+Copyright (c) 2004-2008, Leonard Richardson
 
 All rights reserved.
 
@@ -79,12 +79,13 @@
 from __future__ import generators
 
 __author__ = "Leonard Richardson (leonardr at segfault.org)"
-__version__ = "3.0.5"
-__copyright__ = "Copyright (c) 2004-2007 Leonard Richardson"
+__version__ = "3.0.7a"
+__copyright__ = "Copyright (c) 2004-2008 Leonard Richardson"
 __license__ = "New-style BSD"
 
 from sgmllib import SGMLParser, SGMLParseError
 import codecs
+import markupbase
 import types
 import re
 import sgmllib
@@ -92,9 +93,14 @@
   from htmlentitydefs import name2codepoint
 except ImportError:
   name2codepoint = {}
+try:
+    set
+except NameError:
+    from sets import Set as set
 
-#This hack makes Beautiful Soup able to parse XML with namespaces
+#These hacks make Beautiful Soup able to parse XML with namespaces
 sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
+markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
 
 DEFAULT_OUTPUT_ENCODING = "utf-8"
 
@@ -157,6 +163,7 @@
         if self.nextSibling:
             self.nextSibling.previousSibling = self.previousSibling
         self.previousSibling = self.nextSibling = None
+        return self
 
     def _lastRecursiveChild(self):
         "Finds the last element beneath this object to be parsed."
@@ -237,8 +244,9 @@
     def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                     **kwargs):
         """Returns all items that match the given criteria and appear
-        before after Tag in the document."""
-        return self._findAll(name, attrs, text, limit, self.nextGenerator)
+        after this Tag in the document."""
+        return self._findAll(name, attrs, text, limit, self.nextGenerator,
+                             **kwargs)
 
     def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
         """Returns the closest sibling to this Tag that matches the
@@ -389,6 +397,18 @@
 
 class NavigableString(unicode, PageElement):
 
+    def __new__(cls, value):
+        """Create a new NavigableString.
+
+        When unpickling a NavigableString, this method is called with
+        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
+        passed in to the superclass's __new__ or the superclass won't know
+        how to handle non-ASCII characters.
+        """
+        if isinstance(value, unicode):
+            return unicode.__new__(cls, value)
+        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
+
     def __getnewargs__(self):
         return (NavigableString.__str__(self),)
 
@@ -402,7 +422,7 @@
             raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
 
     def __unicode__(self):
-        return unicode(str(self))
+        return str(self).decode(DEFAULT_OUTPUT_ENCODING)
 
     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
         if encoding:
@@ -688,6 +708,16 @@
             s = ''.join(s)
         return s
 
+    def decompose(self):
+        """Recursively destroys the contents of this tree."""
+        contents = [i for i in self.contents]
+        for i in contents:
+            if isinstance(i, Tag):
+                i.decompose()
+            else:
+                i.extract()
+        self.extract()
+
     def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
         return self.__str__(encoding, True)
 
@@ -970,6 +1000,7 @@
     NESTABLE_TAGS = {}
     RESET_NESTING_TAGS = {}
     QUOTE_TAGS = {}
+    PRESERVE_WHITESPACE_TAGS = []
 
     MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                        lambda x: x.group(1) + ' />'),
@@ -993,7 +1024,7 @@
 
     def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                  markupMassage=True, smartQuotesTo=XML_ENTITIES,
-                 convertEntities=None, selfClosingTags=None):
+                 convertEntities=None, selfClosingTags=None, isHTML=False):
         """The Soup object is initialized as the 'root tag', and the
         provided markup (which can be a string or a file-like object)
         is fed into the underlying parser.
@@ -1055,7 +1086,7 @@
         self.markup = markup
         self.markupMassage = markupMassage
         try:
-            self._feed()
+            self._feed(isHTML=isHTML)
         except StopParsing:
             pass
         self.markup = None                 # The markup can now be GCed
@@ -1070,7 +1101,7 @@
             return
         return self.convert_codepoint(n)
 
-    def _feed(self, inDocumentEncoding=None):
+    def _feed(self, inDocumentEncoding=None, isHTML=False):
         # Convert the document to Unicode.
         markup = self.markup
         if isinstance(markup, unicode):
@@ -1079,9 +1110,10 @@
         else:
             dammit = UnicodeDammit\
                      (markup, [self.fromEncoding, inDocumentEncoding],
-                      smartQuotesTo=self.smartQuotesTo)
+                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
             markup = dammit.unicode
             self.originalEncoding = dammit.originalEncoding
+            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
         if markup:
             if self.markupMassage:
                 if not isList(self.markupMassage):
@@ -1154,8 +1186,10 @@
 
     def endData(self, containerClass=NavigableString):
         if self.currentData:
-            currentData = ''.join(self.currentData)
-            if not currentData.translate(self.STRIP_ASCII_SPACES):
+            currentData = u''.join(self.currentData)
+            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
+                not set([tag.name for tag in self.tagStack]).intersection(
+                    self.PRESERVE_WHITESPACE_TAGS)):
                 if '\n' in currentData:
                     currentData = '\n'
                 else:
@@ -1432,12 +1466,15 @@
     def __init__(self, *args, **kwargs):
         if not kwargs.has_key('smartQuotesTo'):
             kwargs['smartQuotesTo'] = self.HTML_ENTITIES
+        kwargs['isHTML'] = True
         BeautifulStoneSoup.__init__(self, *args, **kwargs)
 
     SELF_CLOSING_TAGS = buildTagMap(None,
                                     ['br' , 'hr', 'input', 'img', 'meta',
                                     'spacer', 'link', 'frame', 'base'])
 
+    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
+
     QUOTE_TAGS = {'script' : None, 'textarea' : None}
 
     #According to the HTML standard, each of these inline tags can
@@ -1482,7 +1519,7 @@
                                 NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
 
     # Used to detect the charset in a META tag; see start_meta
-    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)")
+    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
 
     def start_meta(self, attrs):
         """Beautiful Soup can detect a charset included in a META tag,
@@ -1505,25 +1542,28 @@
         if httpEquiv and contentType: # It's an interesting meta tag.
             match = self.CHARSET_RE.search(contentType)
             if match:
-                if getattr(self, 'declaredHTMLEncoding') or \
-                       (self.originalEncoding == self.fromEncoding):
-                    # This is our second pass through the document, or
-                    # else an encoding was specified explicitly and it
-                    # worked. Rewrite the meta tag.
-                    newAttr = self.CHARSET_RE.sub\
-                              (lambda(match):match.group(1) +
-                               "%SOUP-ENCODING%", value)
+                if (self.declaredHTMLEncoding is not None or
+                    self.originalEncoding == self.fromEncoding):
+                    # An HTML encoding was sniffed while converting
+                    # the document to Unicode, or an HTML encoding was
+                    # sniffed during a previous pass through the
+                    # document, or an encoding was specified
+                    # explicitly and it worked. Rewrite the meta tag.
+                    def rewrite(match):
+                        return match.group(1) + "%SOUP-ENCODING%"
+                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                     attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                                newAttr)
                     tagNeedsEncodingSubstitution = True
                 else:
                     # This is our first pass through the document.
-                    # Go through it again with the new information.
+                    # Go through it again with the encoding information.
                     newCharset = match.group(3)
                     if newCharset and newCharset != self.originalEncoding:
                         self.declaredHTMLEncoding = newCharset
                         self._feed(self.declaredHTMLEncoding)
                         raise StopParsing
+                    pass
         tag = self.unknown_starttag("meta", attrs)
         if tag and tagNeedsEncodingSubstitution:
             tag.containsSubstitutions = True
@@ -1646,20 +1686,19 @@
     import chardet
 #    import chardet.constants
 #    chardet.constants._debug = 1
-except:
+except ImportError:
     chardet = None
-chardet = None
 
 # cjkcodecs and iconv_codec make Python know about more character encodings.
 # Both are available from http://cjkpython.i18n.org/
 # They're built in if you use Python 2.4.
 try:
     import cjkcodecs.aliases
-except:
+except ImportError:
     pass
 try:
     import iconv_codec
-except:
+except ImportError:
     pass
 
 class UnicodeDammit:
@@ -1676,9 +1715,10 @@
                         "x-sjis" : "shift-jis" }
 
     def __init__(self, markup, overrideEncodings=[],
-                 smartQuotesTo='xml'):
+                 smartQuotesTo='xml', isHTML=False):
+        self.declaredHTMLEncoding = None
         self.markup, documentEncoding, sniffedEncoding = \
-                     self._detectEncoding(markup)
+                     self._detectEncoding(markup, isHTML)
         self.smartQuotesTo = smartQuotesTo
         self.triedEncodings = []
         if markup == '' or isinstance(markup, unicode):
@@ -1704,6 +1744,7 @@
             for proposed_encoding in ("utf-8", "windows-1252"):
                 u = self._convertFrom(proposed_encoding)
                 if u: break
+
         self.unicode = u
         if not u: self.originalEncoding = None
 
@@ -1771,7 +1812,7 @@
         newdata = unicode(data, encoding)
         return newdata
 
-    def _detectEncoding(self, xml_data):
+    def _detectEncoding(self, xml_data, isHTML=False):
         """Given a document, tries to detect its XML encoding."""
         xml_encoding = sniffed_xml_encoding = None
         try:
@@ -1819,13 +1860,17 @@
             else:
                 sniffed_xml_encoding = 'ascii'
                 pass
-            xml_encoding_match = re.compile \
-                                 ('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
-                                 .match(xml_data)
         except:
             xml_encoding_match = None
-        if xml_encoding_match:
+        xml_encoding_match = re.compile(
+            '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
+        if not xml_encoding_match and isHTML:
+            regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
+            xml_encoding_match = regexp.search(xml_data)
+        if xml_encoding_match is not None:
             xml_encoding = xml_encoding_match.groups()[0].lower()
+            if isHTML:
+                self.declaredHTMLEncoding = xml_encoding
             if sniffed_xml_encoding and \
                (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                  'iso-10646-ucs-4', 'ucs-4', 'csucs4',
@@ -1916,5 +1961,5 @@
 #By default, act as an HTML pretty-printer.
 if __name__ == '__main__':
     import sys
-    soup = BeautifulSoup(sys.stdin.read())
+    soup = BeautifulSoup(sys.stdin)
     print soup.prettify()



More information about the tor-commits mailing list