2 @file base_extractor.py 3 @author Alexey, bgv <developers.hce@gmail.com> 4 @link http://hierarchical-cluster-engine.com/ 5 @copyright Copyright © 2013 IOIX Ukraine 6 @license http://hierarchical-cluster-engine.com/license/ 7 @package HCE project node API 26 logger.debug(
"Time execution limit was reached: %s seconds.", str(CONSTS.TIME_EXECUTION_LIMIT))
27 raise Exception(
"Timed out!")
31 ERR_MSG_ADJUST_PUB_DATE =
"Error in adjustPubDate: " 32 ERR_MSG_ADJUST_MEDIA =
"Error in adjustMedia: " 33 ERR_MSG_ADJUST_CONTENT_UTF8_ENCODED =
"Error in adjustContentUTF8Encoded: " 48 if isinstance(dates, list)
and len(dates):
50 pub_date =
" ".
join(dates)
53 if pub_date
and len(dates)
and not re.search(
r'\d+', pub_date):
55 except Exception
as err:
56 ExceptionLog.handler(logger, err, ERR_MSG_ADJUST_PUB_DATE)
89 if isinstance(data, list)
and len(data) > 1:
# Dispatch table: tag-name constant -> adjustment callable for that tag's
# value. addTag looks it up as ``self.tag[tag_name](tag_value)``.
# Only media, UTF-8 encoded content, publication date and link have a
# dedicated adjuster; every other listed tag is routed to adjustNone.
tag = {
    CONSTS.TAG_MEDIA: adjustMedia,
    CONSTS.TAG_CONTENT_UTF8_ENCODED: adjustContentUTF8Encoded,
    CONSTS.TAG_PUB_DATE: adjustPubDate,
    CONSTS.TAG_TITLE: adjustNone,
    CONSTS.TAG_LINK: adjustLink,
    CONSTS.TAG_DESCRIPTION: adjustNone,
    CONSTS.TAG_DC_DATE: adjustNone,
    CONSTS.TAG_AUTHOR: adjustNone,
    CONSTS.TAG_GUID: adjustNone,
    CONSTS.TAG_KEYWORDS: adjustNone,
    CONSTS.TAG_MEDIA_THUMBNAIL: adjustNone,
    CONSTS.TAG_ENCLOSURE: adjustNone,
    CONSTS.TAG_MEDIA_CONTENT: adjustNone,
    CONSTS.TAG_GOOGLE: adjustNone,
    CONSTS.TAG_GOOGLE_TOTAL: adjustNone,
    CONSTS.HTML_LANG: adjustNone,
}
# Bit-flag table: each tag-name constant gets one bit (positions 0..25).
# Two pairs deliberately share a bit, exactly as in the original literal:
#   CONSTS.TAG_CONTENT_UTF8_ENCODED and CONSTS.CONTENT  -> bit 1
#   CONSTS.TAG_PUB_DATE and CONSTS.PUBLISHED            -> bit 2
tagsMask = {
    CONSTS.TAG_MEDIA: 1 << 0,
    CONSTS.TAG_CONTENT_UTF8_ENCODED: 1 << 1,
    CONSTS.CONTENT: 1 << 1,
    CONSTS.TAG_PUB_DATE: 1 << 2,
    CONSTS.PUBLISHED: 1 << 2,
    CONSTS.TAG_TITLE: 1 << 3,
    CONSTS.TAG_LINK: 1 << 4,
    CONSTS.TAG_DESCRIPTION: 1 << 5,
    CONSTS.UPDATED_PARSED: 1 << 6,
    CONSTS.TAG_DC_DATE: 1 << 7,
    CONSTS.TAG_AUTHOR: 1 << 8,
    CONSTS.TAG_GUID: 1 << 9,
    CONSTS.TAG_KEYWORDS: 1 << 10,
    CONSTS.TAG_MEDIA_THUMBNAIL: 1 << 11,
    CONSTS.TAG_ENCLOSURE: 1 << 12,
    CONSTS.TAG_MEDIA_CONTENT: 1 << 13,
    CONSTS.TAG_GOOGLE: 1 << 14,
    CONSTS.TAG_GOOGLE_TOTAL: 1 << 15,
    CONSTS.HTML_LANG: 1 << 16,
    CONSTS.PARENT_RSS_FEED: 1 << 17,
    CONSTS.PARENT_RSS_FEED_URLMD5: 1 << 18,
    CONSTS.SUMMARY_DETAIL: 1 << 19,
    CONSTS.SUMMARY: 1 << 20,
    CONSTS.COMMENTNS: 1 << 21,
    CONSTS.TAGS: 1 << 22,
    CONSTS.UPDATED: 1 << 23,
    CONSTS.TAG_ORDER_NUMBER: 1 << 24,
    CONSTS.TAG_SOURCE_URL: 1 << 25,
}
161 def __init__(self, config, templ=None, domain=None, processorProperties=None):
165 scraperPropFileName = self.
config.get(
"Application",
"property_file_name")
167 if scraperPropFileName
is not None:
171 self.
rank = CONSTS.SCRAPER_RANK_INIT
177 self.
data = {
"extractor":
"Base extractor",
"data":
"",
"name":
""}
180 if processorProperties
is not None and "SCRAPER_TAG_ITEMS_DELIMITER" in processorProperties:
185 if processorProperties
is not None and "tagsValidator" in processorProperties:
187 self.
tagsValidator = json.loads(processorProperties[
"tagsValidator"])
188 except Exception
as excp:
189 ExceptionLog.handler(logger, excp,
'>>> tagsValidator wronj json format', (), \
190 {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
194 return "%s" % (self.
name)
206 if scraperPropFileName
is not None:
208 with open(scraperPropFileName,
"rb")
as fd:
209 scraperProperies = json.loads(fd.read())
210 self.
properties = scraperProperies[self.__class__.__name__][CONSTS.PROPERTIES_KEY]
211 except Exception
as excp:
212 logger.debug(
">>> Some error with scraper property loads = " + str(excp))
219 if tagName
in result.tags:
220 if isinstance(result.tags[tagName], basestring):
221 ret = (result.tags[tagName].strip() ==
"")
222 elif isinstance(result.tags[tagName], list):
223 if len(result.tags[tagName]) > 0:
225 elif isinstance(result.tags[tagName], dict):
226 if "data" in result.tags[tagName]:
227 if isinstance(result.tags[tagName][
"data"], basestring):
228 ret = (result.tags[tagName][
"data"].strip() ==
"")
229 elif isinstance(result.tags[tagName][
"data"], list):
230 for elem
in result.tags[tagName][
"data"]:
231 ret = (elem.strip() ==
"")
242 if isinstance(tagValue, list):
243 if len(tagValue) == 0:
256 if conditionElem[
"type"] ==
"include":
258 if re.compile(conditionElem[
"RE"]).match(tagValueElem)
is not None:
260 elif conditionElem[
"type"] ==
"exclude":
261 if re.compile(conditionElem[
"RE"]).match(tagValueElem)
is not None:
272 if isinstance(tagValue, list):
274 for elem
in tagValue:
279 elif isinstance(tagValue, basestring):
282 except Exception
as excp:
283 ExceptionLog.handler(logger, excp,
'>>> something wrong in tagValueValidate method', (), \
284 {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
290 def addTag(self, result, tag_name, tag_value, xpath="", isDefaultTag=False, callAdjustment=True, tagType=None,
291 allowNotFilled=False):
293 if tag_name
not in result.blockedByXpathTags:
295 if tag_value
is not None:
298 if tag_value
and not isinstance(tag_value, list):
300 if tag_value
and isinstance(tag_value, list):
302 tag_value = self.
tag[tag_name](tag_value)
303 except Exception
as err:
304 logger.debug(
'No tag name in result template: %s', str(err))
307 result.errorMessage = ERR_MSG_OK
309 if (tag_name
not in result.tags.keys()
and self.
isTagValueNotEmpty(tag_value)
is not None)
or \
312 data = {
"extractor":
"Base extractor",
"data":
"",
"name":
""}
313 data[
"data"] = tag_value
314 data[
"name"] = tag_name
315 data[
"xpath"] = xpath
316 data[
"type"] = tagType
317 data[
"lang"] = dc_processor.scraper_result.Result.TAGS_LANG_DEFAULT
318 data[
"lang_suffix"] = dc_processor.scraper_result.Result.TAGS_LANG_SUFFIX_DEFAULT
319 data[
"extractor"] = self.__class__.__name__
320 result.tags[tag_name] = data
321 if isDefaultTag
and tag_name
not in result.defaultTags:
322 result.defaultTags.append(tag_name)
325 logger.debug(
">>> BaseExtractor.addTag, tags in break list; tag is = " + tag_name)
333 for metric
in response.metrics:
334 logger.debug(
"response.tags:\n%s\nmetric:\n%s",
varDump(response.tags),
varDump(metric))
335 metric.calculateMetricValue(response.tags)
336 except Exception, err:
337 ExceptionLog.handler(logger, err, CONSTS.MSG_ERROR_CALC_METRICS)
349 if exctractorName
in rankProp:
350 self.
rank = rankProp[exctractorName]
353 logger.debug(
">>> Wrong json string in processorProperties[\"%s\"]", CONSTS.RANK_KEY)
358 logger.debug(
">>> Rank is : %s", str(self.
rank))
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)