[or-cvs] r18284: {torflow} Update BeautifulSoup to 3.0.7a. We're going to stay away from 3.1 for now (torflow/trunk/NetworkScanners/libs/BeautifulSoup)
mikeperry at seul.org
mikeperry at seul.org
Wed Jan 28 04:17:46 UTC 2009
Author: mikeperry
Date: 2009-01-27 23:17:46 -0500 (Tue, 27 Jan 2009)
New Revision: 18284
Modified:
torflow/trunk/NetworkScanners/libs/BeautifulSoup/BeautifulSoup.py
Log:
Update BeautifulSoup to 3.0.7a. We're going to stay away from
3.1 for now since it is worse at parsing bad html and has
some compatibility issues.
Modified: torflow/trunk/NetworkScanners/libs/BeautifulSoup/BeautifulSoup.py
===================================================================
--- torflow/trunk/NetworkScanners/libs/BeautifulSoup/BeautifulSoup.py 2009-01-28 00:03:47 UTC (rev 18283)
+++ torflow/trunk/NetworkScanners/libs/BeautifulSoup/BeautifulSoup.py 2009-01-28 04:17:46 UTC (rev 18284)
@@ -42,7 +42,7 @@
Here, have some legalese:
-Copyright (c) 2004-2007, Leonard Richardson
+Copyright (c) 2004-2008, Leonard Richardson
All rights reserved.
@@ -79,12 +79,13 @@
from __future__ import generators
__author__ = "Leonard Richardson (leonardr at segfault.org)"
-__version__ = "3.0.5"
-__copyright__ = "Copyright (c) 2004-2007 Leonard Richardson"
+__version__ = "3.0.7a"
+__copyright__ = "Copyright (c) 2004-2008 Leonard Richardson"
__license__ = "New-style BSD"
from sgmllib import SGMLParser, SGMLParseError
import codecs
+import markupbase
import types
import re
import sgmllib
@@ -92,9 +93,14 @@
from htmlentitydefs import name2codepoint
except ImportError:
name2codepoint = {}
+try:
+ set
+except NameError:
+ from sets import Set as set
-#This hack makes Beautiful Soup able to parse XML with namespaces
+#These hacks make Beautiful Soup able to parse XML with namespaces
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
+markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
DEFAULT_OUTPUT_ENCODING = "utf-8"
@@ -157,6 +163,7 @@
if self.nextSibling:
self.nextSibling.previousSibling = self.previousSibling
self.previousSibling = self.nextSibling = None
+ return self
def _lastRecursiveChild(self):
"Finds the last element beneath this object to be parsed."
@@ -237,8 +244,9 @@
def findAllNext(self, name=None, attrs={}, text=None, limit=None,
**kwargs):
"""Returns all items that match the given criteria and appear
- before after Tag in the document."""
- return self._findAll(name, attrs, text, limit, self.nextGenerator)
+ after this Tag in the document."""
+ return self._findAll(name, attrs, text, limit, self.nextGenerator,
+ **kwargs)
def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
"""Returns the closest sibling to this Tag that matches the
@@ -389,6 +397,18 @@
class NavigableString(unicode, PageElement):
+ def __new__(cls, value):
+ """Create a new NavigableString.
+
+ When unpickling a NavigableString, this method is called with
+ the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
+ passed in to the superclass's __new__ or the superclass won't know
+ how to handle non-ASCII characters.
+ """
+ if isinstance(value, unicode):
+ return unicode.__new__(cls, value)
+ return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
+
def __getnewargs__(self):
return (NavigableString.__str__(self),)
@@ -402,7 +422,7 @@
raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
def __unicode__(self):
- return unicode(str(self))
+ return str(self).decode(DEFAULT_OUTPUT_ENCODING)
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
if encoding:
@@ -688,6 +708,16 @@
s = ''.join(s)
return s
+ def decompose(self):
+ """Recursively destroys the contents of this tree."""
+ contents = [i for i in self.contents]
+ for i in contents:
+ if isinstance(i, Tag):
+ i.decompose()
+ else:
+ i.extract()
+ self.extract()
+
def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
return self.__str__(encoding, True)
@@ -970,6 +1000,7 @@
NESTABLE_TAGS = {}
RESET_NESTING_TAGS = {}
QUOTE_TAGS = {}
+ PRESERVE_WHITESPACE_TAGS = []
MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
lambda x: x.group(1) + ' />'),
@@ -993,7 +1024,7 @@
def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
markupMassage=True, smartQuotesTo=XML_ENTITIES,
- convertEntities=None, selfClosingTags=None):
+ convertEntities=None, selfClosingTags=None, isHTML=False):
"""The Soup object is initialized as the 'root tag', and the
provided markup (which can be a string or a file-like object)
is fed into the underlying parser.
@@ -1055,7 +1086,7 @@
self.markup = markup
self.markupMassage = markupMassage
try:
- self._feed()
+ self._feed(isHTML=isHTML)
except StopParsing:
pass
self.markup = None # The markup can now be GCed
@@ -1070,7 +1101,7 @@
return
return self.convert_codepoint(n)
- def _feed(self, inDocumentEncoding=None):
+ def _feed(self, inDocumentEncoding=None, isHTML=False):
# Convert the document to Unicode.
markup = self.markup
if isinstance(markup, unicode):
@@ -1079,9 +1110,10 @@
else:
dammit = UnicodeDammit\
(markup, [self.fromEncoding, inDocumentEncoding],
- smartQuotesTo=self.smartQuotesTo)
+ smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
markup = dammit.unicode
self.originalEncoding = dammit.originalEncoding
+ self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
if markup:
if self.markupMassage:
if not isList(self.markupMassage):
@@ -1154,8 +1186,10 @@
def endData(self, containerClass=NavigableString):
if self.currentData:
- currentData = ''.join(self.currentData)
- if not currentData.translate(self.STRIP_ASCII_SPACES):
+ currentData = u''.join(self.currentData)
+ if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
+ not set([tag.name for tag in self.tagStack]).intersection(
+ self.PRESERVE_WHITESPACE_TAGS)):
if '\n' in currentData:
currentData = '\n'
else:
@@ -1432,12 +1466,15 @@
def __init__(self, *args, **kwargs):
if not kwargs.has_key('smartQuotesTo'):
kwargs['smartQuotesTo'] = self.HTML_ENTITIES
+ kwargs['isHTML'] = True
BeautifulStoneSoup.__init__(self, *args, **kwargs)
SELF_CLOSING_TAGS = buildTagMap(None,
['br' , 'hr', 'input', 'img', 'meta',
'spacer', 'link', 'frame', 'base'])
+ PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
+
QUOTE_TAGS = {'script' : None, 'textarea' : None}
#According to the HTML standard, each of these inline tags can
@@ -1482,7 +1519,7 @@
NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
# Used to detect the charset in a META tag; see start_meta
- CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)")
+ CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
def start_meta(self, attrs):
"""Beautiful Soup can detect a charset included in a META tag,
@@ -1505,25 +1542,28 @@
if httpEquiv and contentType: # It's an interesting meta tag.
match = self.CHARSET_RE.search(contentType)
if match:
- if getattr(self, 'declaredHTMLEncoding') or \
- (self.originalEncoding == self.fromEncoding):
- # This is our second pass through the document, or
- # else an encoding was specified explicitly and it
- # worked. Rewrite the meta tag.
- newAttr = self.CHARSET_RE.sub\
- (lambda(match):match.group(1) +
- "%SOUP-ENCODING%", value)
+ if (self.declaredHTMLEncoding is not None or
+ self.originalEncoding == self.fromEncoding):
+ # An HTML encoding was sniffed while converting
+ # the document to Unicode, or an HTML encoding was
+ # sniffed during a previous pass through the
+ # document, or an encoding was specified
+ # explicitly and it worked. Rewrite the meta tag.
+ def rewrite(match):
+ return match.group(1) + "%SOUP-ENCODING%"
+ newAttr = self.CHARSET_RE.sub(rewrite, contentType)
attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
newAttr)
tagNeedsEncodingSubstitution = True
else:
# This is our first pass through the document.
- # Go through it again with the new information.
+ # Go through it again with the encoding information.
newCharset = match.group(3)
if newCharset and newCharset != self.originalEncoding:
self.declaredHTMLEncoding = newCharset
self._feed(self.declaredHTMLEncoding)
raise StopParsing
+ pass
tag = self.unknown_starttag("meta", attrs)
if tag and tagNeedsEncodingSubstitution:
tag.containsSubstitutions = True
@@ -1646,20 +1686,19 @@
import chardet
# import chardet.constants
# chardet.constants._debug = 1
-except:
+except ImportError:
chardet = None
-chardet = None
# cjkcodecs and iconv_codec make Python know about more character encodings.
# Both are available from http://cjkpython.i18n.org/
# They're built in if you use Python 2.4.
try:
import cjkcodecs.aliases
-except:
+except ImportError:
pass
try:
import iconv_codec
-except:
+except ImportError:
pass
class UnicodeDammit:
@@ -1676,9 +1715,10 @@
"x-sjis" : "shift-jis" }
def __init__(self, markup, overrideEncodings=[],
- smartQuotesTo='xml'):
+ smartQuotesTo='xml', isHTML=False):
+ self.declaredHTMLEncoding = None
self.markup, documentEncoding, sniffedEncoding = \
- self._detectEncoding(markup)
+ self._detectEncoding(markup, isHTML)
self.smartQuotesTo = smartQuotesTo
self.triedEncodings = []
if markup == '' or isinstance(markup, unicode):
@@ -1704,6 +1744,7 @@
for proposed_encoding in ("utf-8", "windows-1252"):
u = self._convertFrom(proposed_encoding)
if u: break
+
self.unicode = u
if not u: self.originalEncoding = None
@@ -1771,7 +1812,7 @@
newdata = unicode(data, encoding)
return newdata
- def _detectEncoding(self, xml_data):
+ def _detectEncoding(self, xml_data, isHTML=False):
"""Given a document, tries to detect its XML encoding."""
xml_encoding = sniffed_xml_encoding = None
try:
@@ -1819,13 +1860,17 @@
else:
sniffed_xml_encoding = 'ascii'
pass
- xml_encoding_match = re.compile \
- ('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
- .match(xml_data)
except:
xml_encoding_match = None
- if xml_encoding_match:
+ xml_encoding_match = re.compile(
+ '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
+ if not xml_encoding_match and isHTML:
+ regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
+ xml_encoding_match = regexp.search(xml_data)
+ if xml_encoding_match is not None:
xml_encoding = xml_encoding_match.groups()[0].lower()
+ if isHTML:
+ self.declaredHTMLEncoding = xml_encoding
if sniffed_xml_encoding and \
(xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
'iso-10646-ucs-4', 'ucs-4', 'csucs4',
@@ -1916,5 +1961,5 @@
#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
import sys
- soup = BeautifulSoup(sys.stdin.read())
+ soup = BeautifulSoup(sys.stdin)
print soup.prettify()
More information about the tor-commits mailing list