Definition at line 1650 of file Fetcher.py.
◆ __init__()
def dc_crawler.Fetcher.SimpleCharsetDetector.__init__(self, content=None)
Definition at line 1653 of file Fetcher.py.
1655 self.content = content
def __init__(self)
constructor
◆ detect()
def dc_crawler.Fetcher.SimpleCharsetDetector.detect(self, content=None, contentType="html")
Definition at line 1657 of file Fetcher.py.
1657 def detect(self, content=None, contentType="html"):
1666 if contentType == 'html':
1667 pattern = r'<meta(?!\s*(?:name|value)\s*=)(?:[^>]*?content\s*=[\s"\']*)?([^>]*?)[\s"\';]*charset\s*=[\s"\']*([^\s"\'/>]*)'
1668 matchObj = re.search(pattern, cnt, re.I | re.M | re.S)
1670 ret = matchObj.group(2)
1671 elif contentType == 'xml':
1672 ret = self.xmlCharsetDetector(None, cnt)
1674 except Exception, err:
1675 logger.error("Exception: %s", str(err))
1677 if ret is not None and ret in CONSTS.charsetDetectorMap:
1678 logger.debug("Extracted wrong encoding '%s' from page replace to correct '%s'", ret,
1679 CONSTS.charsetDetectorMap[ret])
1680 ret = CONSTS.charsetDetectorMap[ret]
◆ xmlCharsetDetector()
def dc_crawler.Fetcher.SimpleCharsetDetector.xmlCharsetDetector(self, fp, buff=None)
Attempts to detect the character encoding of the XML file
given by a file object fp. fp must not be a codec-wrapped file
object!
The return value can be:
- If detection of the BOM succeeds, the codec name of the
corresponding Unicode charset is returned.
- If BOM detection fails, the XML declaration is searched for
the encoding attribute and its value is returned. The "<"
character must then be the very first character in the file
(it's the XML standard, after all).
- If both the BOM and the XML declaration fail, None is returned.
According to XML 1.0 the encoding should then be utf_8, but it
wasn't detected by the means offered here. At least one can be
pretty sure that a character encoding including most of ASCII
is used :-/
Definition at line 1685 of file Fetcher.py.
1685 def xmlCharsetDetector(self, fp, buff=None):
1686 """ Attempts to detect the character encoding of the xml file
1687 given by a file object fp. fp must not be a codec wrapped file
1690 The return value can be:
1691 - if detection of the BOM succeeds, the codec name of the
1692 corresponding unicode charset is returned
1694 - if BOM detection fails, the xml declaration is searched for
1695 the encoding attribute and its value returned. the "<"
1696 character has to be the very first in the file then (it's xml
1697 standard after all).
1699 - if BOM and xml declaration fail, None is returned. According
1700 to xml 1.0 it should be utf_8 then, but it wasn't detected by
1701 the means offered here. at least one can be pretty sure that a
1702 character coding including most of ASCII is used :-/
1708 (0x00, 0x00, 0xFE, 0xFF) : "utf_32_be",
1709 (0xFF, 0xFE, 0x00, 0x00) : "utf_32_le",
1710 (0xFE, 0xFF, None, None) : "utf_16_be",
1711 (0xFF, 0xFE, None, None) : "utf_16_le",
1712 (0xEF, 0xBB, 0xBF, None) : "utf_8",
1719 (byte1, byte2, byte3, byte4) = tuple(map(ord, fp.read(4)))
1722 bomDetection = bomDict.get((byte1, byte2, byte3, byte4))
1723 if not bomDetection :
1724 bomDetection = bomDict.get((byte1, byte2, byte3, None))
1725 if not bomDetection :
1726 bomDetection = bomDict.get((byte1, byte2, None, None))
1740 buff = fp.read(2048)
1743 xmlDeclPattern = r"""
1744 ^<\?xml # w/o BOM, xmldecl starts with <?xml at the first byte
1745 .+? # some chars (version info), matched minimal
1746 encoding= # encoding attribute begins
1747 ["'] # attribute start delimiter
1748 (?P<encstr> # what's matched in the brackets will be named encstr
1749 [^"']+ # every character not delimiter (not overly exact!)
1750 ) # closes the brackets pair for the named group
1751 ["'] # attribute end delimiter
1752 .*? # some chars optionally (standalone decl or whitespace)
1756 xmlDeclRE = re.compile(xmlDeclPattern, re.VERBOSE)
1759 match = xmlDeclRE.search(buff)
1763 return match.group("encstr")
◆ content
dc_crawler.Fetcher.SimpleCharsetDetector.content |
The documentation for this class was generated from the following file: