| import warnings |
| |
| warnings.warn("BeautifulSoup 3.x (as of 3.1) is not fully compatible with html5lib and support will be removed in the future", DeprecationWarning) |
| |
| from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration |
| |
| import _base |
| from html5lib.constants import namespaces, DataLossWarning |
| |
class AttrList(object):
    """Dict-like view over the attributes of a wrapped element.

    The element's ``attrs`` are copied into the ``self.attrs`` dict when the
    view is created.  Reads are served from that copy; writes go through to
    the wrapped element and are mirrored into the copy so that subsequent
    reads on the same view observe them.
    """

    def __init__(self, element):
        self.element = element
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        """Iterate over ``(name, value)`` pairs (not just the names)."""
        return iter(self.attrs.items())

    def __setitem__(self, name, value):
        """Set attribute ``name`` on the wrapped element."""
        self.element[name] = value
        # Keep the local snapshot in sync; otherwise a read right after a
        # write on the same view would miss the new value.
        self.attrs[name] = value

    def items(self):
        """Return the ``(name, value)`` pairs of the snapshot."""
        return self.attrs.items()

    def keys(self):
        """Return the attribute names of the snapshot."""
        return self.attrs.keys()

    def __getitem__(self, name):
        """Return the value of ``name``; raises ``KeyError`` if absent."""
        return self.attrs[name]

    def __contains__(self, name):
        return name in self.attrs

    def __eq__(self, other):
        """Compare against any object offering ``keys()``, ``in`` and ``[]``."""
        if len(self.keys()) != len(other.keys()):
            return False
        for name in self.keys():
            if name not in other:
                return False
            if self[name] != other[name]:
                return False
        return True

    def __ne__(self, other):
        # Python 2 does not derive != from __eq__, so define it explicitly.
        return not self.__eq__(other)
| |
class Element(_base.Node):
    """html5lib tree node backed by a BeautifulSoup element.

    ``element`` is the wrapped BeautifulSoup object, ``soup`` is the soup
    instance used when new tags are created, and ``namespace`` is the value
    reported through ``nameTuple``.
    """

    def __init__(self, element, soup, namespace):
        _base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def _nodeIndex(self, node, refNode):
        """Return the position of ``refNode.element`` among our contents.

        The lookup is by identity rather than equality.  ``node`` is not
        used; it is kept so the signature stays unchanged.  Returns ``None``
        when ``refNode.element`` is not one of this element's contents.
        """
        for index, child in enumerate(self.element.contents):
            if child is refNode.element:
                return index
        return None

    def appendChild(self, node):
        """Append ``node`` as the last child, merging adjacent text nodes."""
        # NOTE(review): the exact class comparison (rather than isinstance)
        # is kept from the original; presumably it stops NavigableString
        # subclasses from being merged as text -- confirm against
        # BeautifulSoup.
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # Concatenate new text onto old text node
            # (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
            newStr = NavigableString(self.element.contents[-1]+node.element)

            # Remove the old text node
            # (Can't simply use .extract() by itself, because it fails if
            # an equal text node exists within the parent node)
            oldElement = self.element.contents[-1]
            del self.element.contents[-1]
            oldElement.parent = None
            oldElement.extract()

            self.element.insert(len(self.element.contents), newStr)
        else:
            self.element.insert(len(self.element.contents), node.element)
            node.parent = self

    def getAttributes(self):
        """Return a fresh ``AttrList`` view over the element's attributes."""
        return AttrList(self.element)

    def setAttributes(self, attributes):
        """Copy every ``name: value`` pair of ``attributes`` onto the element.

        A falsy ``attributes`` (``None`` or empty) is a no-op.
        """
        if attributes:
            for name, value in attributes.items():
                self.element[name] = value

    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        """Insert ``data`` as text, before ``insertBefore`` or at the end."""
        text = TextNode(NavigableString(data), self.soup)
        if insertBefore is not None:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        """Insert ``node`` immediately before the child ``refNode``.

        Text is merged into a directly preceding text node when there is
        one.  Raises ``ValueError`` if ``refNode`` is not a child of this
        node.
        """
        index = self._nodeIndex(node, refNode)
        if index is None:
            raise ValueError("refNode is not a child of this node")
        # ``index > 0`` matters: with index == 0 the expression index-1
        # would wrap around to the *last* child and merge the text there.
        if (node.element.__class__ == NavigableString and index > 0
            and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            newStr = NavigableString(self.element.contents[index-1]+node.element)
            oldNode = self.element.contents[index-1]
            del self.element.contents[index-1]
            oldNode.parent = None
            oldNode.extract()

            self.element.insert(index-1, newStr)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        """Detach ``node`` from its parent's contents.

        Raises ``ValueError`` if ``node`` is not found among this element's
        contents.
        """
        index = self._nodeIndex(node.parent, node)
        if index is None:
            raise ValueError("node is not a child of this node")
        del node.parent.element.contents[index]
        # The entry was already removed from the contents list by identity;
        # clear the parent link before calling extract().
        node.element.parent = None
        node.element.extract()
        node.parent = None

    def reparentChildren(self, newParent):
        """Move every child of this element, in order, to ``newParent``."""
        while self.element.contents:
            child = self.element.contents[0]
            child.extract()
            if isinstance(child, Tag):
                newParent.appendChild(Element(child, self.soup, namespaces["html"]))
            else:
                newParent.appendChild(TextNode(child, self.soup))

    def cloneNode(self):
        """Return a childless copy with the same name, namespace and attributes."""
        node = Element(Tag(self.soup, self.element.name), self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        """Return the element's contents list (truthy when non-empty)."""
        return self.element.contents

    def getNameTuple(self):
        """Return ``(namespace, name)``, using the HTML namespace when unset."""
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)
| |
class TextNode(Element):
    """Node wrapping a BeautifulSoup string-like element.

    ``Element.__init__`` is not called: the base node is initialised
    directly with a name of ``None`` and no namespace is assigned here.
    """

    def __init__(self, element, soup):
        _base.Node.__init__(self, None)
        self.soup = soup
        self.element = element

    def cloneNode(self):
        """Cloning is not supported for text nodes."""
        raise NotImplementedError
| |
class TreeBuilder(_base.TreeBuilder):
    """html5lib tree builder that produces a BeautifulSoup tree."""

    def __init__(self, namespaceHTMLElements):
        # A truthy flag only triggers a warning; it is still passed on to
        # the base class unchanged.
        if namespaceHTMLElements:
            warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
        _base.TreeBuilder.__init__(self, namespaceHTMLElements)

    def documentClass(self):
        """Create a fresh empty soup and return it wrapped as the document node."""
        soup = BeautifulSoup("")
        self.soup = soup
        return Element(soup, soup, None)

    def insertDoctype(self, token):
        """Insert a DOCTYPE declaration built from ``token`` at the top of the soup.

        ``token`` is read for the keys ``"name"``, ``"publicId"`` and
        ``"systemId"``.
        """
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        if publicId:
            text = 'DOCTYPE %s PUBLIC "%s" "%s"' % (name, publicId, systemId or "")
        elif systemId:
            text = 'DOCTYPE %s SYSTEM "%s"' % (name, systemId)
        else:
            text = "DOCTYPE %s" % name
        self.soup.insert(0, Declaration(text))

    def elementClass(self, name, namespace):
        """Create a new tag called ``name`` wrapped in an ``Element``.

        Warns with ``DataLossWarning`` when ``namespace`` is not ``None``.
        """
        if namespace is not None:
            warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
        tag = Tag(self.soup, name)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        """Wrap ``data`` in a ``Comment`` and return it as a ``TextNode``."""
        comment = Comment(data)
        return TextNode(comment, self.soup)

    def fragmentClass(self):
        """Create a fresh soup named ``[document_fragment]`` and wrap it."""
        soup = BeautifulSoup("")
        self.soup = soup
        soup.name = "[document_fragment]"
        return Element(soup, soup, None)

    def appendChild(self, node):
        """Append ``node.element`` at the end of the soup's top-level contents."""
        position = len(self.soup.contents)
        self.soup.insert(position, node.element)

    def testSerializer(self, element):
        """Delegate to the module-level ``testSerializer`` function."""
        return testSerializer(element)

    def getDocument(self):
        """Return the underlying soup object."""
        return self.soup

    def getFragment(self):
        """Return the ``element`` of the fragment built by the base class."""
        fragment = _base.TreeBuilder.getFragment(self)
        return fragment.element
| |
def testSerializer(element):
    """Serialize ``element`` and its descendants to a multi-line string.

    Each node contributes one line (plus one line per attribute for tags);
    children are indented two spaces deeper than their parent.  The lines
    are joined with newlines.
    """
    import re
    doctype_pattern = re.compile(r'DOCTYPE\s+(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?')
    lines = []

    def serializeElement(node, indent=0):
        pad = ' ' * indent
        if isinstance(node, Declaration):
            match = doctype_pattern.match(node.string)
            assert match is not None, "DOCTYPE did not match expected format"
            name = match.group('name')
            publicId = match.group('publicId')
            # With a public id the system id defaults to ""; without one it
            # stays None unless the SYSTEM form matched.
            if publicId is None:
                systemId = match.group('systemId2')
            else:
                systemId = match.group('systemId1') or ""

            if publicId is None and systemId is None:
                lines.append("|%s<!DOCTYPE %s>" % (pad, name))
            else:
                lines.append('|%s<!DOCTYPE %s "%s" "%s">' %
                             (pad, name, publicId or "", systemId or ""))

        elif isinstance(node, BeautifulSoup):
            if node.name == "[document_fragment]":
                lines.append("#document-fragment")
            else:
                lines.append("#document")

        elif isinstance(node, Comment):
            lines.append("|%s<!-- %s -->" % (pad, node.string))
        elif isinstance(node, unicode):
            lines.append('|%s"%s"' % (pad, node))
        else:
            lines.append("|%s<%s>" % (pad, node.name))
            if node.attrs:
                attr_pad = ' ' * (indent + 2)
                lines.extend('|%s%s="%s"' % (attr_pad, attr_name, attr_value)
                             for attr_name, attr_value in sorted(node.attrs))

        # Children, if the node has any, go one level (two spaces) deeper.
        child_indent = indent + 2
        if hasattr(node, "contents"):
            for child in node.contents:
                serializeElement(child, child_indent)

    serializeElement(element, 0)

    return "\n".join(lines)