HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
LinkResolver.py
Go to the documentation of this file.
1 # coding: utf-8
2 
3 """
4 HCE project, Python bindings, Distributed Tasks Manager application.
5 LinkResolver is a post-processing module class that provides the main functionality for resolving links.
6 
7 @package: dc_postprocessor
8 @file LinkResolver.py
9 @author Alexander Vybornyh <alexander.hce.cluster@gmail.com>
10 @link: http://hierarchical-cluster-engine.com/
11 @copyright: Copyright &copy; 2013-2017 IOIX Ukraine
12 @license: http://hierarchical-cluster-engine.com/license/
13 @since: 0.1
14 """
15 
16 import re
17 import json
18 import base64
19 import requests
20 import requests.exceptions
21 
22 from dc_postprocessor.PostProcessingModuleClass import PostProcessingModuleClass
23 
24 # This class is a run-at-once module for resolving links
26 
# Key looked up in the batch item properties to obtain the link-resolve settings
LINK_RESOLVE_PROPERTY_NAME = 'LINK_RESOLVE'

# Option names passed to the getConfigOption callback
CONFIG_OPTION_METHOD = 'method'
CONFIG_OPTION_DELIMITER = 'delimiter'
CONFIG_OPTION_HEADER_FILE = 'headers_file'

# Key inside the link-resolve settings that maps URL patterns to HTTP methods
PROPERTY_NAME_METHOD = 'method'

# Field of the unpacked processed content that holds the links to resolve
LINK_FIELD_NAME = 'link'
# Regular expression applied to the response body; group 1 is taken as the resolved link
SEARCH_PATTERN = r'redirect_url".*href="(.*)">'

# Fallback values used when an option is absent from the configuration
DEFAULT_VALUE_METHOD = 'HEAD'
DEFAULT_VALUE_DELIMITER = ','

# Message templates for raised exceptions and logged errors
ERROR_MSG_INITIALIZATION_CALLBACK = "Error initialization of callback function for get config options."
ERROR_MSG_INITIALIZATION_LOGGER = "Error initialization of self.logger."
ERROR_MSG_RESOLVE__REDIRECT_URL = "Resolve redirect url failed. Error: %s"
ERROR_MSG_READ_HEADER = "Error read header file. File: '%s', error: '%s', line: '%s'"
49 
50  # Default initialization
def __init__(self, getConfigOption=None, log=None):
    """Create the module and declare all of its instance attributes.

    @param getConfigOption - callback forwarded to the base class
                             (presumably exposed there as self.getConfigOption - verify)
    @param log - logger forwarded to the base class
                 (presumably exposed there as self.logger - verify)
    """
    PostProcessingModuleClass.__init__(self, getConfigOption, log)

    # Every attribute used by init(), resolve() and processBatchItem() is
    # declared here so the object has a complete, predictable shape even
    # before init() has been called.
    self.method = None
    self.delimiter = None
    self.headers = None
    self.siteProperty = None
58 
59 
60  # # initialization
61  #
62  # @param - None
63  # @return - None
def init(self):
    """Load the module settings through the configuration callback.

    Reads the HTTP method, the link delimiter and the headers file name from
    the configuration section named after the class, then loads the headers.

    @return - None
    @raise Exception - if the configuration callback or the logger is not set
    """
    if self.getConfigOption is None:
        raise Exception(self.ERROR_MSG_INITIALIZATION_CALLBACK)

    if self.logger is None:
        raise Exception(self.ERROR_MSG_INITIALIZATION_LOGGER)

    sectionName = self.__class__.__name__

    self.method = self.getConfigOption(sectionName=sectionName,
                                       optionName=self.CONFIG_OPTION_METHOD,
                                       defaultValue=self.DEFAULT_VALUE_METHOD)

    self.delimiter = self.getConfigOption(sectionName=sectionName,
                                          optionName=self.CONFIG_OPTION_DELIMITER,
                                          defaultValue=self.DEFAULT_VALUE_DELIMITER)

    # An empty delimiter is unusable: str.split('') raises ValueError.
    # NOTE(review): falling back to the default delimiter here - confirm this
    # is the intended replacement value for an empty configuration entry.
    if self.delimiter == "":
        self.delimiter = self.DEFAULT_VALUE_DELIMITER

    headersFileName = self.getConfigOption(sectionName=sectionName,
                                           optionName=self.CONFIG_OPTION_HEADER_FILE)
    self.headers = self.__readHeaderFile(headersFileName)
84 
85 # self.logger.debug("Module parameters: method = '%s', delimiter = '%s', headers:\n%s",
86 # str(self.method), str(self.delimiter), varDump(self.headers))
87 
88 
89  # # read headers file
90  #
91  # @param fileName - the file name to read
92  # @return - dictionary of header names mapped to their values
def __readHeaderFile(self, fileName):
    """Parse a file of 'Name: value' lines into a dictionary of headers.

    Blank lines and lines whose first non-blank character is '#' are skipped.
    A line without a ':' separator or with an empty name is reported through
    the logger and skipped, so one bad line never affects the others.

    @param fileName - the file name to read
    @return - dictionary of header names mapped to their values
    """
    ret = {}
    with open(fileName, 'r') as f:
        for header in f.read().splitlines():
            line = header.strip()
            # Comments are recognised before parsing so that a comment
            # without ':' is not reported as a malformed header.
            if not line or line.startswith('#'):
                continue

            # Split on the first ':' only; the value may contain more colons.
            key, separator, value = line.partition(':')
            key = key.strip()
            if not separator or not key:
                self.logger.error(self.ERROR_MSG_READ_HEADER, str(fileName),
                                  "missing ':' separator or empty header name", header)
                continue

            ret[key] = value.strip()

    return ret
109 
110 
111  # # resolve redirect link
112  #
113  # @param url - url for resolve redirect
114  # @return resolved link
def resolve(self, url):
    """Follow redirects for a URL and return the final location.

    The HTTP method is self.method unless the per-item settings in
    self.siteProperty map a regular expression matching the URL to another
    method. When the response body matches SEARCH_PATTERN, the captured
    group replaces the final request URL as the result.

    @param url - url for resolve redirect
    @return resolved link, or the unchanged url if any step fails
    """
    ret = url
    method = self.method

    try:
        # The settings are optional: without this type guard a missing value
        # (None) raises TypeError and the link would never be resolved.
        siteProperty = self.siteProperty
        if isinstance(siteProperty, dict) and self.PROPERTY_NAME_METHOD in siteProperty:
            methods = siteProperty[self.PROPERTY_NAME_METHOD]
            if isinstance(methods, dict):
                for pattern, value in methods.items():
                    if re.search(pattern, url, re.I | re.U) is not None:
                        method = value
                        break

        # Arguments are handed to the logger unformatted; calling str() on a
        # non-ASCII unicode url raises on Python 2 and would abort the resolve.
        self.logger.debug("Apply method: '%s' for %s", method, url)

        prepared = requests.Request(method=method, url=url, headers=self.headers).prepare()
        # The session is closed on exit so its pooled connections are released.
        with requests.Session() as session:
            res = session.send(prepared, allow_redirects=True)
            ret = res.request.url

            if res.content:
                match = re.search(self.SEARCH_PATTERN, res.content, re.I | re.U)
                if match is not None:
                    ret = match.group(1)

    except Exception as err:
        # Best effort: any failure (including requests' RequestException,
        # which derives from Exception) is logged and the input is returned.
        self.logger.error(self.ERROR_MSG_RESOLVE__REDIRECT_URL, str(err))

    return ret
147 
148 
149  # # process batch item interface method
150  #
151  # @param batchItem - batch item instance
152  # @return - the same batch item instance, with its processed contents updated
def processBatchItem(self, batchItem):
    """Resolve the links stored in every processed content of a batch item.

    Each processed content is a base64-encoded JSON document. When it is an
    object with a string LINK_FIELD_NAME field, that field is split by
    self.delimiter, every part is passed through resolve(), and the content
    is re-packed in place with the joined result.

    @param batchItem - batch item instance
    @return - the same batch item instance
    """
    # The settings belong to a single batch item: reset them when the item
    # has none so values from a previously processed item are never reused.
    if self.LINK_RESOLVE_PROPERTY_NAME in batchItem.properties:
        self.siteProperty = batchItem.properties[self.LINK_RESOLVE_PROPERTY_NAME]
        self.logger.debug("!!! self.siteProperty: %s, type: %s", str(self.siteProperty), str(type(self.siteProperty)))
    else:
        self.siteProperty = {}

    response = batchItem.urlContentResponse
    if response is None or not isinstance(response.processedContents, list):
        return batchItem

    contents = response.processedContents
    for index, packed in enumerate(contents):
        if not isinstance(packed, basestring) or packed == "":
            continue

        # unpack processed content
        processedContent = json.loads(base64.b64decode(packed))

        # Only a JSON object holding a string link field can be processed;
        # any other payload is left untouched.
        if not isinstance(processedContent, dict):
            continue
        links = processedContent.get(self.LINK_FIELD_NAME)
        if not isinstance(links, basestring):
            continue

        resolved = [self.resolve(link) for link in links.split(self.delimiter)]
        processedContent[self.LINK_FIELD_NAME] = self.delimiter.join(resolved)

        # pack updated processed content
        contents[index] = base64.b64encode(json.dumps(processedContent))

    return batchItem
logger
-mask-info
Definition: join.py:1
getConfigOption