HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
ftest_SimpleCharsetDetector.py
Go to the documentation of this file.
1 #coding: utf-8
2 '''
3 HCE project, Python bindings, DRCE module
4 SimpleCharsetDetector functional tests.
5 
6 @package: drce
7 @author bgv bgv.hce@gmail.com
8 @link: http://hierarchical-cluster-engine.com/
9 @copyright: Copyright © 2015 IOIX Ukraine
10 @license: http://hierarchical-cluster-engine.com/license/
11 @since: 0.1
12 '''
13 
14 
15 import re
16 # #The SimpleCharsetDetector class
17 # extracts the charset declared in an HTML <meta> tag
class SimpleCharsetDetector(object):
    '''
    Extracts the charset declared in an HTML <meta> tag.

    Recognizes both the http-equiv form
    (<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">)
    and the short form (<meta charset="UTF-8">). Only the first matching
    <meta> tag in the content is used.
    '''

    # Compiled once at class creation instead of on every detect() call.
    # The adjacent raw-string pieces concatenate to a single pattern.
    _META_CHARSET_RE = re.compile(
        # "<meta" that is not immediately followed by a name=/value= attribute
        r'<meta(?!\s*(?:name|value)\s*=)'
        # optional lead-in up to and including content=" inside the same tag
        r'(?:[^>]*?content\s*=[\s"\']*)?'
        # group 1: whatever precedes "charset" (e.g. the MIME type), then separators
        r'([^>]*?)[\s"\';]*'
        # group 2: the charset token, terminated by whitespace, a quote, "/" or ">"
        r'charset\s*=[\s"\']*([^\s"\'/>]*)',
        re.I | re.M | re.S)

    def __init__(self, content=None):
        '''
        @param content: default markup to inspect when detect() is called
                        without an argument
        '''
        self.content = content

    def detect(self, content=None):
        '''
        Returns the charset name declared in the markup.

        @param content: markup to inspect; when None, the content given to
                        the constructor is used instead
        @return: the charset token exactly as written in the markup (may be
                 an empty string when "charset=" has no value), or None when
                 no declaration is found or the content is not searchable
        '''
        cnt = self.content if content is None else content

        try:
            matchObj = self._META_CHARSET_RE.search(cnt)
        except TypeError:
            # Content the regex engine cannot search (e.g. None or a number):
            # keep the best-effort contract and report "no charset found".
            return None

        if matchObj is None:
            return None
        return matchObj.group(2)
43 
44 
# Functional smoke checks, run at module level: each call is expected to
# print "UTF-8". The single-argument parenthesised print form produces the
# same output under Python 2 and is valid syntax under Python 3.
print(SimpleCharsetDetector().detect("<html>\n" + '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />' + "\n"))
print(SimpleCharsetDetector().detect('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />'))