HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
base_extractor.py
Go to the documentation of this file.
1 """@package docstring
2  @file base_extractor.py
3  @author Alexey, bgv <developers.hce@gmail.com>
4  @link http://hierarchical-cluster-engine.com/
5  @copyright Copyright &copy; 2013 IOIX Ukraine
6  @license http://hierarchical-cluster-engine.com/license/
7  @package HCE project node API
8  @since 0.1
9 """
10 
11 import re
12 import json
13 import dc_processor.Constants as CONSTS
15 from app.Utils import varDump
16 from app.Utils import ExceptionLog
17 import app.Utils as Utils # pylint: disable=F0401
18 
# Module-wide logger, obtained from the MPLogger wrapper provided by app.Utils
logger = Utils.MPLogger().getLogger()
21 
22 
# # Time execution limit handler
#
# Logs that CONSTS.TIME_EXECUTION_LIMIT was reached and aborts the current work by raising.
# The (signum, frame) signature suggests registration via signal.signal(); the registration
# itself is not visible in this module - TODO confirm.
#
# @param signum unused
# @param frame unused
# @raise Exception always, with the message "Timed out!"
def signal_handler(signum, frame):  # pylint: disable=W0613
    limit = str(CONSTS.TIME_EXECUTION_LIMIT)
    logger.debug("Time execution limit was reached: %s seconds.", limit)
    raise Exception("Timed out!")
28 
29 
# Module-level constants

# Message prefixes for errors raised inside the adjust* helpers
ERR_MSG_ADJUST_PUB_DATE = "Error in adjustPubDate: "
ERR_MSG_ADJUST_MEDIA = "Error in adjustMedia: "
ERR_MSG_ADJUST_CONTENT_UTF8_ENCODED = "Error in adjustContentUTF8Encoded: "

# Error message stored in a result when a tag was processed without error
ERR_MSG_OK = ""

# Value used by adjustPubDate() when no usable publication date is available
EMPTY_DATE = ""
38 
# # Adjust publication date
#
# A non-empty list is joined with single spaces into one string; any other value is used as is.
# A truthy result that contains no digit is replaced with EMPTY_DATE.
# Any exception is logged through ExceptionLog.handler and the value computed so far is returned.
# NOTE(review): the listing this was recovered from had its indentation flattened; the digit
# check is assumed to apply to both the list and the non-list case - confirm.
#
# @param dates publication date (or list of dates) extracted from content
# @return adjusted publication date
def adjustPubDate(dates):
    adjusted = EMPTY_DATE
    try:
        # TODO: improve to return the most appropriate item instead of joining all of them
        isFilledList = isinstance(dates, list) and len(dates)
        adjusted = " ".join(dates) if isFilledList else dates
        if adjusted and len(dates) and re.search(r'\d+', adjusted) is None:
            adjusted = EMPTY_DATE
    except Exception as err:
        ExceptionLog.handler(logger, err, ERR_MSG_ADJUST_PUB_DATE)

    return adjusted
59 
60 
# # Adjust data in media tag
#
# Currently a passthrough: the media value is returned unchanged.
# TODO: a per-item HTTP URL validation of list values used to live here in commented-out
# form (keep only the items accepted by an HttpUrl validator); it was never active.
#
# @param medias media extracted from content
# @return the same object that was passed in
def adjustMedia(medias):
    return medias
76 
77 
# # Adjust data in content_encoded tag
#
# Currently a passthrough: the content is returned unchanged.
# NOTE(review): the "def" line was missing from the listing this file was recovered from;
# the name is taken from its use in BaseExtractor.tag and the parameter from the
# "@param data" comment - confirm against the original source.
#
# @param data content extracted from content
# @return the same object that was passed in
def adjustContentUTF8Encoded(data):
    return data
83 
84 
# # Adjust data in link tag
#
# A list holding more than one item is reduced to its first item; every other value
# (including an empty or single-item list) is returned unchanged.
#
# @param data link value extracted from content
# @return first item of a multi-item list, otherwise data itself
def adjustLink(data):
    hasSeveralItems = isinstance(data, list) and len(data) > 1
    return data[0] if hasSeveralItems else data
92 
93 
# # No-op adjustment
#
# Used for tags whose value needs no adjustment.
#
# @param data tag value
# @return the same object that was passed in
def adjustNone(data):
    return data
96 
97 
# # The BaseExtractor class
#
# Base class for custom extractors.
# Provides the shared functionality: scraper properties loading, tag value validation,
# adding tags to a result, metrics calculation and rank reading.
class BaseExtractor(object):

    # Scraper properties; set per instance by loadScraperProperties()
    properties = None

    # Tag name -> adjustment callable that addTag() applies to the tag value
    tag = {
        CONSTS.TAG_MEDIA: adjustMedia,
        CONSTS.TAG_CONTENT_UTF8_ENCODED: adjustContentUTF8Encoded,
        CONSTS.TAG_PUB_DATE: adjustPubDate,
        CONSTS.TAG_TITLE: adjustNone,
        CONSTS.TAG_LINK: adjustLink,
        CONSTS.TAG_DESCRIPTION: adjustNone,
        CONSTS.TAG_DC_DATE: adjustNone,
        CONSTS.TAG_AUTHOR: adjustNone,
        CONSTS.TAG_GUID: adjustNone,
        CONSTS.TAG_KEYWORDS: adjustNone,
        CONSTS.TAG_MEDIA_THUMBNAIL: adjustNone,
        CONSTS.TAG_ENCLOSURE: adjustNone,
        CONSTS.TAG_MEDIA_CONTENT: adjustNone,
        CONSTS.TAG_GOOGLE: adjustNone,
        CONSTS.TAG_GOOGLE_TOTAL: adjustNone,
        CONSTS.HTML_LANG: adjustNone,
    }

    # Tag name -> single-bit flag. CONSTS.CONTENT shares its bit with TAG_CONTENT_UTF8_ENCODED
    # and CONSTS.PUBLISHED shares its bit with TAG_PUB_DATE.
    # Not referenced by the methods of this class.
    tagsMask = {
        CONSTS.TAG_MEDIA: 1,
        CONSTS.TAG_CONTENT_UTF8_ENCODED: 1 << 1,
        CONSTS.CONTENT: 1 << 1,
        CONSTS.TAG_PUB_DATE: 1 << 2,
        CONSTS.PUBLISHED: 1 << 2,
        CONSTS.TAG_TITLE: 1 << 3,

        CONSTS.TAG_LINK: 1 << 4,
        CONSTS.TAG_DESCRIPTION: 1 << 5,
        CONSTS.UPDATED_PARSED: 1 << 6,
        CONSTS.TAG_DC_DATE: 1 << 7,

        CONSTS.TAG_AUTHOR: 1 << 8,
        CONSTS.TAG_GUID: 1 << 9,
        CONSTS.TAG_KEYWORDS: 1 << 10,
        CONSTS.TAG_MEDIA_THUMBNAIL: 1 << 11,

        CONSTS.TAG_ENCLOSURE: 1 << 12,
        CONSTS.TAG_MEDIA_CONTENT: 1 << 13,
        CONSTS.TAG_GOOGLE: 1 << 14,
        CONSTS.TAG_GOOGLE_TOTAL: 1 << 15,

        CONSTS.HTML_LANG: 1 << 16,
        CONSTS.PARENT_RSS_FEED: 1 << 17,
        CONSTS.PARENT_RSS_FEED_URLMD5: 1 << 18,
        CONSTS.SUMMARY_DETAIL: 1 << 19,

        CONSTS.SUMMARY: 1 << 20,
        CONSTS.COMMENTNS: 1 << 21,
        CONSTS.TAGS: 1 << 22,
        CONSTS.UPDATED: 1 << 23,

        CONSTS.TAG_ORDER_NUMBER: 1 << 24,
        CONSTS.TAG_SOURCE_URL: 1 << 25,
    }


    # # class constructor
    #
    # @param config configuration object; its get("Application", "property_file_name") is read
    # @param templ unused
    # @param domain unused
    # @param processorProperties optional mapping of processor properties; the keys
    #        "SCRAPER_TAG_ITEMS_DELIMITER" and "tagsValidator" (a JSON string) are read here
    def __init__(self, config, templ=None, domain=None, processorProperties=None):  # pylint: disable=W0612,W0613
        self.config = config
        self.processorProperties = processorProperties
        self.properties = None
        scraperPropFileName = self.config.get("Application", "property_file_name")

        if scraperPropFileName is not None:
            self.loadScraperProperties(scraperPropFileName)

        self.name = "Base extractor"
        self.rank = CONSTS.SCRAPER_RANK_INIT

        # support processing modes
        self.process_mode = CONSTS.PROCESS_ALGORITHM_REGULAR
        self.modules = {}

        self.data = {"extractor": "Base extractor", "data": "", "name": ""}
        self.db_dc_scraper_db = None
        self.DBConnector = None

        # Items delimiter, a single space unless overridden by the processor properties
        if processorProperties is not None and "SCRAPER_TAG_ITEMS_DELIMITER" in processorProperties:
            self.imgDelimiter = processorProperties["SCRAPER_TAG_ITEMS_DELIMITER"]
        else:
            self.imgDelimiter = ' '

        # Optional validation rules; stay None when absent or when the JSON cannot be decoded
        self.tagsValidator = None
        if processorProperties is not None and "tagsValidator" in processorProperties:
            try:
                self.tagsValidator = json.loads(processorProperties["tagsValidator"])
            except Exception as excp:
                ExceptionLog.handler(logger, excp, '>>> tagsValidator wronj json format', (),
                                     {ExceptionLog.LEVEL_NAME_ERROR: ExceptionLog.LEVEL_VALUE_DEBUG})


    # # string representation, the extractor name
    #
    def __str__(self):
        return "%s" % (self.name)


    # # debug representation, repr of the (name, rank) tuple
    #
    def __repr__(self):
        return repr((self.name, self.rank))


    # # loadScraperProperties
    # loads scraper properties from a json file into self.properties
    #
    # The file content is indexed by the name of the concrete class and then by
    # CONSTS.PROPERTIES_KEY. Any failure (missing file, bad json, missing key) is logged
    # at debug level and leaves self.properties unchanged.
    #
    # @param scraperPropFileName properties file name, None means nothing to load
    def loadScraperProperties(self, scraperPropFileName):
        if scraperPropFileName is None:
            return
        try:
            with open(scraperPropFileName, "rb") as fd:
                scraperProperties = json.load(fd)
            self.properties = scraperProperties[self.__class__.__name__][CONSTS.PROPERTIES_KEY]
        except Exception as excp:
            logger.debug(">>> Some error with scraper property loads = %s", str(excp))


    # # isTagNotFilled
    #
    # Tells whether result.tags holds no meaningful value for the tag:
    #  - the tag is absent;
    #  - a string value is blank after strip();
    #  - a list value is empty;
    #  - a dict value has no "data" key, or its "data" is a blank string, or its "data" is a
    #    list whose items are all blank strings (an empty list included);
    #  - a value of any other type.
    #
    # @param result object with a tags mapping
    # @param tagName name of the tag to check
    # @return True if the tag is considered not filled, False otherwise
    def isTagNotFilled(self, result, tagName):
        if tagName not in result.tags:
            return True

        value = result.tags[tagName]
        if isinstance(value, dict):
            if "data" not in value:
                return True
            value = value["data"]
            if isinstance(value, list):
                return all(elem.strip() == "" for elem in value)
        elif isinstance(value, list):
            return len(value) == 0

        if isinstance(value, basestring):
            return value.strip() == ""
        return True


    # # isTagValueNotEmpty
    #
    # Despite the name this does not return a boolean.
    #
    # @param tagValue tag value
    # @return None for an empty list, otherwise tagValue itself
    def isTagValueNotEmpty(self, tagValue):
        if isinstance(tagValue, list) and len(tagValue) == 0:
            return None
        return tagValue


    # # tagValueElemValidate
    #
    # Checks one value against one validation rule. The regular expression is applied
    # with match(), i.e. anchored at the beginning of the value.
    #
    # @param tagValueElem string to check
    # @param conditionElem rule dict; "type" is read always, "RE" only for "include"/"exclude"
    # @return for type "include" True only if the expression matches, for type "exclude"
    #         True only if it does not match, for any other type True
    def tagValueElemValidate(self, tagValueElem, conditionElem):
        ruleType = conditionElem["type"]
        if ruleType not in ("include", "exclude"):
            return True
        matched = re.match(conditionElem["RE"], tagValueElem) is not None
        return matched if ruleType == "include" else not matched


    # # tagValueValidate
    #
    # Applies the rule stored in self.tagsValidator[self.name][tagName], if there is one.
    # A list is reduced to the items that pass the rule, a string is kept only if it passes.
    # An exception raised while validating is logged and the value computed so far is returned.
    #
    # @param tagName name of the tag
    # @param tagValue tag value, a list or a string; other types are returned unchanged
    # @return validated value, or None if nothing passed the rule
    def tagValueValidate(self, tagName, tagValue):
        ret = tagValue
        validator = self.tagsValidator
        if validator is not None and self.name in validator and tagName in validator[self.name]:
            try:
                rule = validator[self.name][tagName]
                if isinstance(tagValue, list):
                    # Filled item by item so that a failure keeps the items accepted so far
                    ret = []
                    for elem in tagValue:
                        if self.tagValueElemValidate(elem, rule):
                            ret.append(elem)
                    if len(ret) == 0:
                        ret = None
                elif isinstance(tagValue, basestring):
                    if not self.tagValueElemValidate(tagValue, rule):
                        ret = None
            except Exception as excp:
                ExceptionLog.handler(logger, excp, '>>> something wrong in tagValueValidate method', (),
                                     {ExceptionLog.LEVEL_NAME_ERROR: ExceptionLog.LEVEL_VALUE_DEBUG})
        return ret


    # # addTag
    #
    # Validates, optionally adjusts and stores a tag value in result.tags.
    # Nothing is stored when the tag name is listed in result.blockedByXpathTags or when
    # validation rejects the value. Otherwise result.errorCode/errorMessage are reset and the
    # tag is stored if it is absent or not filled yet and the new value is not an empty list,
    # or unconditionally when allowNotFilled is set.
    # NOTE(review): relies on dc_processor.scraper_result being imported at module level; that
    # import is not among the import lines visible in the recovered listing - confirm.
    #
    # @param result object with tags, defaultTags, blockedByXpathTags, errorCode, errorMessage
    # @param tag_name name of the tag
    # @param tag_value value of the tag
    # @param xpath xpath stored with the tag
    # @param isDefaultTag if True the tag name is also appended to result.defaultTags
    # @param callAdjustment if True the adjuster registered in self.tag is applied to the value
    # @param tagType type stored with the tag
    # @param allowNotFilled if True the tag is stored regardless of the filled/empty checks
    # @return True if the tag was stored, False otherwise
    def addTag(self, result, tag_name, tag_value, xpath="", isDefaultTag=False, callAdjustment=True, tagType=None,
               allowNotFilled=False):
        if tag_name in result.blockedByXpathTags:
            logger.debug(">>> BaseExtractor.addTag, tags in break list; tag is = %s", tag_name)
            return False

        tag_value = self.tagValueValidate(tag_name, tag_value)
        if tag_value is None:
            return False

        if callAdjustment:
            try:
                tag_value = self.tag[tag_name](tag_value)
            except Exception as err:
                # Raised for a tag name without a registered adjuster as well as by a failing
                # adjuster; in both cases the unadjusted value is kept
                logger.debug('No tag name in result template: %s', str(err))

        result.errorCode = 0
        result.errorMessage = ERR_MSG_OK

        hasValue = self.isTagValueNotEmpty(tag_value) is not None
        if (tag_name not in result.tags and hasValue) or \
           (self.isTagNotFilled(result, tag_name) and hasValue) or \
           allowNotFilled:
            result.tags[tag_name] = {
                "extractor": self.__class__.__name__,
                "data": tag_value,
                "name": tag_name,
                "xpath": xpath,
                "type": tagType,
                "lang": dc_processor.scraper_result.Result.TAGS_LANG_DEFAULT,
                "lang_suffix": dc_processor.scraper_result.Result.TAGS_LANG_SUFFIX_DEFAULT,
            }
            if isDefaultTag and tag_name not in result.defaultTags:
                result.defaultTags.append(tag_name)
            return True

        return False


    # # calculateMetrics
    #
    # Calls calculateMetricValue(response.tags) on every item of response.metrics.
    #
    # @param response object with metrics and tags
    # @raise Exception whatever the calculation raised, re-raised after being logged
    def calculateMetrics(self, response):
        try:
            for metric in response.metrics:
                logger.debug("response.tags:\n%s\nmetric:\n%s", varDump(response.tags), varDump(metric))
                metric.calculateMetricValue(response.tags)
        except Exception as err:
            ExceptionLog.handler(logger, err, CONSTS.MSG_ERROR_CALC_METRICS)
            # bare raise keeps the original traceback
            raise


    # # rankReading
    #
    # Sets self.rank. The value of processorProperties[CONSTS.RANK_KEY] is decoded as JSON and,
    # if it contains exctractorName, that entry becomes the rank. Otherwise the rank is taken
    # from self.properties[CONSTS.RANK_KEY] when available; otherwise it is left unchanged.
    #
    # @param exctractorName name looked up in the decoded rank mapping
    def rankReading(self, exctractorName):
        wasSet = False
        if self.processorProperties is not None and exctractorName is not None and \
           CONSTS.RANK_KEY in self.processorProperties:
            try:
                # Decode the rank entry itself, not the whole properties mapping
                rankProp = json.loads(self.processorProperties[CONSTS.RANK_KEY])
                if exctractorName in rankProp:
                    self.rank = rankProp[exctractorName]
                    wasSet = True
            except Exception:
                logger.debug(">>> Wrong json string in processorProperties[\"%s\"]", CONSTS.RANK_KEY)

        if not wasSet and self.properties is not None and CONSTS.RANK_KEY in self.properties:
            self.rank = self.properties[CONSTS.RANK_KEY]

        logger.debug(">>> Rank is : %s", str(self.rank))
def tagValueValidate(self, tagName, tagValue)
def signal_handler(signum, frame)
def loadScraperProperties(self, scraperPropFileName)
def tagValueElemValidate(self, tagValueElem, conditionElem)
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
def isTagNotFilled(self, result, tagName)
def addTag(self, result, tag_name, tag_value, xpath="", isDefaultTag=False, callAdjustment=True, tagType=None, allowNotFilled=False)
Definition: join.py:1
def __init__(self, config, templ=None, domain=None, processorProperties=None)