HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_postprocessor.LinkResolver.LinkResolver Class Reference
Inheritance diagram for dc_postprocessor.LinkResolver.LinkResolver:
Collaboration diagram for dc_postprocessor.LinkResolver.LinkResolver:

Public Member Functions

def __init__ (self, getConfigOption=None, log=None)
 
def init (self)
 
def resolve (self, url)
 
def processBatchItem (self, batchItem)
 
- Public Member Functions inherited from dc_postprocessor.PostProcessingModuleClass.PostProcessingModuleClass
def __init__ (self, getConfigOption=None, log=None)
 
def init (self)
 
def processBatch (self, batchObj)
 
def processBatchItem (self, batchItemObj)
 

Public Attributes

 method
 
 delimiter
 
 headers
 
 siteProperty
 
- Public Attributes inherited from dc_postprocessor.PostProcessingModuleClass.PostProcessingModuleClass
 getConfigOption
 
 logger
 

Static Public Attributes

string LINK_RESOLVE_PROPERTY_NAME = 'LINK_RESOLVE'
 
string CONFIG_OPTION_METHOD = 'method'
 
string CONFIG_OPTION_DELIMITER = 'delimiter'
 
string CONFIG_OPTION_HEADER_FILE = 'headers_file'
 
string PROPERTY_NAME_METHOD = 'method'
 
string LINK_FIELD_NAME = 'link'
 
string SEARCH_PATTERN = 'redirect_url\".*href=\"(.*)\">'
 
string DEFAULT_VALUE_METHOD = 'HEAD'
 
string DEFAULT_VALUE_DELIMITER = ','
 
string ERROR_MSG_INITIALIZATION_CALLBACK = "Error initialization of callback function for get config options."
 
string ERROR_MSG_INITIALIZATION_LOGGER = "Error initialization of self.logger."
 
string ERROR_MSG_RESOLVE__REDIRECT_URL = "Resolve redirect url failed. Error: %s"
 
string ERROR_MSG_READ_HEADER = "Error read header file. File: '%s', error: '%s', line: '%s'"
 

Private Member Functions

def __readHeaderFile (self, fileName)
 

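Detailed Description

Post-processing module that resolves redirect links. For every batch item it unpacks each processed content entry (base64-encoded JSON), splits the value of the 'link' field by the configured delimiter, resolves each URL with an HTTP request that follows redirects, and packs the updated content back into the batch item.

Definition at line 25 of file LinkResolver.py.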

Constructor & Destructor Documentation

◆ __init__()

def dc_postprocessor.LinkResolver.LinkResolver.__init__ (   self,
  getConfigOption = None,
  log = None 
)

Definition at line 51 of file LinkResolver.py.

def __init__(self, getConfigOption=None, log=None):
    """Create the module with built-in defaults.

    The configured values are loaded later by init(); until then the
    request method and the link delimiter hold the class defaults and no
    request headers or per-site settings are set.

    @param getConfigOption: callback handed to the base class unchanged
    @param log: logger handed to the base class unchanged
    """
    # The base class initialiser runs first, exactly as before.
    PostProcessingModuleClass.__init__(self, getConfigOption, log)

    # Built-in defaults for the two configurable options.
    self.method, self.delimiter = self.DEFAULT_VALUE_METHOD, self.DEFAULT_VALUE_DELIMITER
    # Nothing has been loaded yet.
    self.headers = self.siteProperty = None
def __init__(self)
constructor
Definition: UIDGenerator.py:19

Member Function Documentation

◆ __readHeaderFile()

def dc_postprocessor.LinkResolver.LinkResolver.__readHeaderFile (   self,
  fileName 
)
private

Definition at line 93 of file LinkResolver.py.

def __readHeaderFile(self, fileName):
    """Read request headers from a text file of ``Name: value`` lines.

    Blank lines and lines whose first non-blank character is ``#`` are
    skipped.  A line without a ``:`` separator, or with an empty name before
    it, is reported through ``self.logger.error`` and skipped, so one bad
    line can neither abort the read nor overwrite an earlier header.

    @param fileName: path of the header file
    @return: dict mapping header name to header value, both stripped
    """
    # variable for result
    ret = {}
    with open(fileName, 'r') as f:
        for line in f.read().splitlines():
            header = line.strip()
            # Nothing to parse on blank lines and comments.
            if not header or header.startswith('#'):
                continue

            # Split on the first ':' only, the value may contain more of them.
            key, separator, value = header.partition(':')
            key = key.strip()
            if not separator or not key:
                self.logger.error(self.ERROR_MSG_READ_HEADER, str(fileName),
                                  "missing ':' separator or empty header name", line)
                continue

            ret[key] = value.strip()

    return ret
-mask-info
Definition: join.py:1
Here is the caller graph for this function:

◆ init()

def dc_postprocessor.LinkResolver.LinkResolver.init (   self)

Definition at line 64 of file LinkResolver.py.

def init(self):
    """Load the module settings through the configuration callback.

    Reads the request method and the link delimiter (falling back to the
    class defaults) and loads the request headers from the file named by
    the 'headers_file' option.  Options are looked up in the section named
    after the class.

    @raise Exception: when the configuration callback or the logger is not set
    """
    readOption = self.getConfigOption
    section = self.__class__.__name__

    # Both collaborators are required before anything can be loaded.
    if readOption is None:
        raise Exception(self.ERROR_MSG_INITIALIZATION_CALLBACK)
    if self.logger is None:
        raise Exception(self.ERROR_MSG_INITIALIZATION_LOGGER)

    self.method = readOption(sectionName=section,
                             optionName=self.CONFIG_OPTION_METHOD,
                             defaultValue=self.DEFAULT_VALUE_METHOD)

    configuredDelimiter = readOption(sectionName=section,
                                     optionName=self.CONFIG_OPTION_DELIMITER,
                                     defaultValue=self.DEFAULT_VALUE_DELIMITER)
    # An empty delimiter cannot be used for splitting, use the default instead.
    self.delimiter = self.DEFAULT_VALUE_DELIMITER if configuredDelimiter == "" else configuredDelimiter

    headersFile = readOption(sectionName=section,
                             optionName=self.CONFIG_OPTION_HEADER_FILE)
    self.headers = self.__readHeaderFile(headersFile)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ processBatchItem()

def dc_postprocessor.LinkResolver.LinkResolver.processBatchItem (   self,
  batchItem 
)

Definition at line 153 of file LinkResolver.py.

def processBatchItem(self, batchItem):
    """Resolve every link stored in the processed contents of one batch item.

    Each non-empty string entry of ``urlContentResponse.processedContents`` is
    unpacked (base64, then JSON).  When it has a 'link' field, the value is
    split by ``self.delimiter``, every part is passed through resolve(), and
    the re-joined result is packed back in place of the original entry.

    @param batchItem: item providing ``properties`` and ``urlContentResponse``
    @return: the same batchItem object, updated in place
    """
    # Per-site settings belong to this item only: reset them when the item has
    # none, so that the settings of a previously processed item are not reused.
    if self.LINK_RESOLVE_PROPERTY_NAME in batchItem.properties:
        self.siteProperty = batchItem.properties[self.LINK_RESOLVE_PROPERTY_NAME]
        self.logger.debug("!!! self.siteProperty: %s, type: %s", str(self.siteProperty), str(type(self.siteProperty)))
    else:
        self.siteProperty = None

    response = batchItem.urlContentResponse
    if response is None or not isinstance(response.processedContents, list):
        return batchItem

    contents = response.processedContents
    for index, packed in enumerate(contents):
        # Only non-empty strings can hold packed content.
        if not isinstance(packed, basestring) or packed == "":
            continue

        # unpack processed content
        processedContent = json.loads(base64.b64decode(packed))
        if self.LINK_FIELD_NAME not in processedContent:
            continue

        # search and call resolve link method
        links = processedContent[self.LINK_FIELD_NAME].split(self.delimiter)
        processedContent[self.LINK_FIELD_NAME] = self.delimiter.join([self.resolve(link) for link in links])

        # pack updated processed content
        contents[index] = base64.b64encode(json.dumps(processedContent))

    return batchItem
Here is the call graph for this function:

◆ resolve()

def dc_postprocessor.LinkResolver.LinkResolver.resolve (   self,
  url 
)

Definition at line 115 of file LinkResolver.py.

def resolve(self, url):
    """Resolve a possibly redirecting URL to its final location.

    Sends one HTTP request with redirects enabled and returns the URL of the
    last request in the chain.  When the response body matches
    ``SEARCH_PATTERN``, the captured href is returned instead.  The request
    method is ``self.method`` unless ``self.siteProperty['method']`` maps a
    pattern that matches the URL to another method.

    Any failure is logged and the original URL is returned.

    @param url: URL to resolve
    @return: resolved URL, or url itself when resolving failed
    """
    # variable for result
    ret = url
    method = self.method
    session = None

    try:
        # Per-site settings are optional: without them the configured method
        # is used instead of failing on the membership test.
        siteProperty = self.siteProperty
        if siteProperty is not None and self.PROPERTY_NAME_METHOD in siteProperty:
            methods = siteProperty[self.PROPERTY_NAME_METHOD]
            for pattern, value in methods.items():
                if re.search(pattern, url, re.I | re.U) is not None:
                    method = value
                    break

        # Arguments are passed unformatted: str() on a non-ASCII unicode URL
        # would raise here and abort the whole resolve.
        self.logger.debug("Apply method: '%s' for %s", method, url)

        session = requests.Session()
        prepared = requests.Request(method=method, url=url, headers=self.headers).prepare()
        res = session.send(prepared, allow_redirects=True)
        ret = res.request.url

        if res.content != "":
            match = re.search(self.SEARCH_PATTERN, res.content, re.I | re.U)
            if match is not None:
                ret = match.group(1)

    except Exception as err:
        # Covers requests.exceptions.RequestException as well, handling is the same.
        self.logger.error(self.ERROR_MSG_RESOLVE__REDIRECT_URL, str(err))
    finally:
        # Release the pooled connections held by the session.
        if session is not None:
            session.close()

    return ret
-mask-info
Here is the caller graph for this function:

Member Data Documentation

◆ CONFIG_OPTION_DELIMITER

string dc_postprocessor.LinkResolver.LinkResolver.CONFIG_OPTION_DELIMITER = 'delimiter'
static

Definition at line 32 of file LinkResolver.py.

◆ CONFIG_OPTION_HEADER_FILE

string dc_postprocessor.LinkResolver.LinkResolver.CONFIG_OPTION_HEADER_FILE = 'headers_file'
static

Definition at line 33 of file LinkResolver.py.

◆ CONFIG_OPTION_METHOD

string dc_postprocessor.LinkResolver.LinkResolver.CONFIG_OPTION_METHOD = 'method'
static

Definition at line 31 of file LinkResolver.py.

◆ DEFAULT_VALUE_DELIMITER

string dc_postprocessor.LinkResolver.LinkResolver.DEFAULT_VALUE_DELIMITER = ','
static

Definition at line 42 of file LinkResolver.py.

◆ DEFAULT_VALUE_METHOD

string dc_postprocessor.LinkResolver.LinkResolver.DEFAULT_VALUE_METHOD = 'HEAD'
static

Definition at line 41 of file LinkResolver.py.

◆ delimiter

dc_postprocessor.LinkResolver.LinkResolver.delimiter

Definition at line 55 of file LinkResolver.py.

◆ ERROR_MSG_INITIALIZATION_CALLBACK

string dc_postprocessor.LinkResolver.LinkResolver.ERROR_MSG_INITIALIZATION_CALLBACK = "Error initialization of callback function for get config options."
static

Definition at line 45 of file LinkResolver.py.

◆ ERROR_MSG_INITIALIZATION_LOGGER

string dc_postprocessor.LinkResolver.LinkResolver.ERROR_MSG_INITIALIZATION_LOGGER = "Error initialization of self.logger."
static

Definition at line 46 of file LinkResolver.py.

◆ ERROR_MSG_READ_HEADER

string dc_postprocessor.LinkResolver.LinkResolver.ERROR_MSG_READ_HEADER = "Error read header file. File: '%s', error: '%s', line: '%s'"
static

Definition at line 48 of file LinkResolver.py.

◆ ERROR_MSG_RESOLVE__REDIRECT_URL

string dc_postprocessor.LinkResolver.LinkResolver.ERROR_MSG_RESOLVE__REDIRECT_URL = "Resolve redirect url failed. Error: %s"
static

Definition at line 47 of file LinkResolver.py.

◆ headers

dc_postprocessor.LinkResolver.LinkResolver.headers

Definition at line 56 of file LinkResolver.py.

◆ LINK_FIELD_NAME

string dc_postprocessor.LinkResolver.LinkResolver.LINK_FIELD_NAME = 'link'
static

Definition at line 37 of file LinkResolver.py.

◆ LINK_RESOLVE_PROPERTY_NAME

string dc_postprocessor.LinkResolver.LinkResolver.LINK_RESOLVE_PROPERTY_NAME = 'LINK_RESOLVE'
static

Definition at line 28 of file LinkResolver.py.

◆ method

dc_postprocessor.LinkResolver.LinkResolver.method

Definition at line 54 of file LinkResolver.py.

◆ PROPERTY_NAME_METHOD

string dc_postprocessor.LinkResolver.LinkResolver.PROPERTY_NAME_METHOD = 'method'
static

Definition at line 35 of file LinkResolver.py.

◆ SEARCH_PATTERN

string dc_postprocessor.LinkResolver.LinkResolver.SEARCH_PATTERN = 'redirect_url\".*href=\"(.*)\">'
static

Definition at line 38 of file LinkResolver.py.

◆ siteProperty

dc_postprocessor.LinkResolver.LinkResolver.siteProperty

Definition at line 57 of file LinkResolver.py.


The documentation for this class was generated from the following file: