| try: |
| frozenset |
| except NameError: |
| # Import from the sets module for python 2.3 |
| from sets import ImmutableSet as frozenset |
| |
| import re |
| |
| import _base |
| from html5lib.constants import rcdataElements, spaceCharacters |
# Flatten the imported iterable of whitespace characters into one unicode
# string so it can be dropped straight into a regex character class.
spaceCharacters = u"".join(spaceCharacters)

# Matches a run of one or more of the whitespace characters joined above.
SPACES_REGEX = re.compile(u"[%s]+" % (spaceCharacters,))
| |
class Filter(_base.Filter):
    """Token-stream filter that collapses whitespace.

    Tokens are dicts read through their "type", "name" and "data" keys.
    Outside of whitespace-sensitive elements, every non-empty
    "SpaceCharacters" token has its data replaced by a single space and
    every "Characters" token has its data passed through collapse_spaces().
    Tokens are modified in place and then yielded; no token is dropped.
    """

    # Elements whose content must be passed through untouched.
    spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))

    def __iter__(self):
        """Yield each token from the base filter, collapsing whitespace.

        A nesting counter records how deep the stream currently is inside
        a whitespace-preserving element; while it is non-zero no token
        data is rewritten.
        """
        depth = 0
        for token in _base.Filter.__iter__(self):
            kind = token["type"]

            if depth:
                # Inside a preserved element: only keep the nesting count
                # balanced, never touch character data.
                if kind == "StartTag":
                    depth += 1
                elif kind == "EndTag":
                    depth -= 1

            elif kind == "StartTag":
                # Entering a preserved element switches collapsing off.
                if token["name"] in self.spacePreserveElements:
                    depth = 1

            elif kind == "SpaceCharacters":
                # Empty data stays empty so that no space is introduced
                # where there was none.
                if token["data"]:
                    token["data"] = u" "

            elif kind == "Characters":
                token["data"] = collapse_spaces(token["data"])

            yield token
| |
def collapse_spaces(text):
    """Return text with every match of SPACES_REGEX replaced by one space."""
    collapsed = SPACES_REGEX.sub(' ', text)
    return collapsed
| |