| import support |
| import unittest, codecs |
| |
| from html5lib.inputstream import HTMLInputStream |
| |
class HTMLInputStreamShortChunk(HTMLInputStream):
    """HTMLInputStream variant whose ``_defaultChunkSize`` is overridden to 2.

    The newline and position tests below construct their streams from this
    class instead of from HTMLInputStream directly.
    """

    # Overrides the class attribute of the same name on HTMLInputStream.
    _defaultChunkSize = 2
| |
| class HTMLInputStreamTest(unittest.TestCase): |
| |
| def test_char_ascii(self): |
| stream = HTMLInputStream("'", encoding='ascii') |
| self.assertEquals(stream.charEncoding[0], 'ascii') |
| self.assertEquals(stream.char(), "'") |
| |
| def test_char_null(self): |
| stream = HTMLInputStream("\x00") |
| self.assertEquals(stream.char(), u'\ufffd') |
| |
| def test_char_utf8(self): |
| stream = HTMLInputStream(u'\u2018'.encode('utf-8'), encoding='utf-8') |
| self.assertEquals(stream.charEncoding[0], 'utf-8') |
| self.assertEquals(stream.char(), u'\u2018') |
| |
| def test_char_win1252(self): |
| stream = HTMLInputStream(u"\xa9\xf1\u2019".encode('windows-1252')) |
| self.assertEquals(stream.charEncoding[0], 'windows-1252') |
| self.assertEquals(stream.char(), u"\xa9") |
| self.assertEquals(stream.char(), u"\xf1") |
| self.assertEquals(stream.char(), u"\u2019") |
| |
| def test_bom(self): |
| stream = HTMLInputStream(codecs.BOM_UTF8 + "'") |
| self.assertEquals(stream.charEncoding[0], 'utf-8') |
| self.assertEquals(stream.char(), "'") |
| |
| def test_utf_16(self): |
| stream = HTMLInputStream((' '*1025).encode('utf-16')) |
| self.assert_(stream.charEncoding[0] in ['utf-16-le', 'utf-16-be'], stream.charEncoding) |
| self.assertEquals(len(stream.charsUntil(' ', True)), 1025) |
| |
| def test_newlines(self): |
| stream = HTMLInputStreamShortChunk(codecs.BOM_UTF8 + "a\nbb\r\nccc\rddddxe") |
| self.assertEquals(stream.position(), (1, 0)) |
| self.assertEquals(stream.charsUntil('c'), u"a\nbb\n") |
| self.assertEquals(stream.position(), (3, 0)) |
| self.assertEquals(stream.charsUntil('x'), u"ccc\ndddd") |
| self.assertEquals(stream.position(), (4, 4)) |
| self.assertEquals(stream.charsUntil('e'), u"x") |
| self.assertEquals(stream.position(), (4, 5)) |
| |
| def test_newlines2(self): |
| size = HTMLInputStream._defaultChunkSize |
| stream = HTMLInputStream("\r" * size + "\n") |
| self.assertEquals(stream.charsUntil('x'), "\n" * size) |
| |
| def test_position(self): |
| stream = HTMLInputStreamShortChunk(codecs.BOM_UTF8 + "a\nbb\nccc\nddde\nf\ngh") |
| self.assertEquals(stream.position(), (1, 0)) |
| self.assertEquals(stream.charsUntil('c'), u"a\nbb\n") |
| self.assertEquals(stream.position(), (3, 0)) |
| stream.unget(u"\n") |
| self.assertEquals(stream.position(), (2, 2)) |
| self.assertEquals(stream.charsUntil('c'), u"\n") |
| self.assertEquals(stream.position(), (3, 0)) |
| stream.unget(u"\n") |
| self.assertEquals(stream.position(), (2, 2)) |
| self.assertEquals(stream.char(), u"\n") |
| self.assertEquals(stream.position(), (3, 0)) |
| self.assertEquals(stream.charsUntil('e'), u"ccc\nddd") |
| self.assertEquals(stream.position(), (4, 3)) |
| self.assertEquals(stream.charsUntil('h'), u"e\nf\ng") |
| self.assertEquals(stream.position(), (6, 1)) |
| |
| def test_position2(self): |
| stream = HTMLInputStreamShortChunk("abc\nd") |
| self.assertEquals(stream.position(), (1, 0)) |
| self.assertEquals(stream.char(), u"a") |
| self.assertEquals(stream.position(), (1, 1)) |
| self.assertEquals(stream.char(), u"b") |
| self.assertEquals(stream.position(), (1, 2)) |
| self.assertEquals(stream.char(), u"c") |
| self.assertEquals(stream.position(), (1, 3)) |
| self.assertEquals(stream.char(), u"\n") |
| self.assertEquals(stream.position(), (2, 0)) |
| self.assertEquals(stream.char(), u"d") |
| self.assertEquals(stream.position(), (2, 1)) |
| |
def buildTestSuite():
    """Load the tests defined in this module.

    Returns the result of asking ``unittest.defaultTestLoader`` to load
    tests by this module's ``__name__``.
    """
    loader = unittest.defaultTestLoader
    suite = loader.loadTestsFromName(__name__)
    return suite
| |
def main():
    """Command-line entry point for this test module.

    Builds the suite first (the returned suite is not used here) and then
    hands control to ``unittest.main()``.
    """
    buildTestSuite()
    unittest.main()
| |
# Run the tests only when this file is executed directly as a script.
if __name__ == '__main__':
    main()