HCE project, Python bindings, Distributed Tasks Manager application.
The ScraperMultiItemsTask class contains the main scraping functionality for multiple items.

@file ScraperMultiItemsTask.py
@author Alexander Vybornyh <alexander.hce.cluster@gmail.com>
@link: http://hierarchical-cluster-engine.com/
@copyright: Copyright © 2013-2015 IOIX Ukraine
@license: http://hierarchical-cluster-engine.com/license/

import xml.sax.saxutils
28 from cement.core
import foundation
30 from dateutil
import parser
31 from scrapy.selector
import Selector
def addEtree(self, key, value):
    """Store a deep copy of an element-tree path value under key.

    A new list is created on first use of the key; subsequent calls
    append to it, so self.etree maps key -> list of path values.
    NOTE(review): interior lines were missing in the reviewed source;
    the first-use initialization is reconstructed — confirm against VCS.
    """
    # 'key not in' replaces Python-2-only dict.has_key().
    if key not in self.etree:
        self.etree[key] = []
    # Deep copy so later mutation of the caller's value cannot
    # corrupt the stored path.
    self.etree[key].append(copy.deepcopy(value))
def addDoc(self, key, value, join, isExtract, mandatory):
    """Store a scraped document and its per-document options under key.

    Deep copies of value/join/isExtract/mandatory are appended to four
    parallel per-key lists (self.docs, self.join, self.isExtract,
    self.mandatory) which stay index-aligned for later joining.
    NOTE(review): the first-use initialization lines were missing in the
    reviewed source and are reconstructed — confirm against VCS.
    """
    # 'key not in' replaces Python-2-only dict.has_key().
    if key not in self.docs:
        self.docs[key] = []
        self.join[key] = []
        self.isExtract[key] = []
        self.mandatory[key] = []
    self.docs[key].append(copy.deepcopy(value))
    self.join[key].append(copy.deepcopy(join))
    self.isExtract[key].append(copy.deepcopy(isExtract))
    self.mandatory[key].append(copy.deepcopy(mandatory))
def getMaxCount(self, inDict):
    """Return the length of the longest value list in inDict (0 if empty).

    NOTE(review): the accumulator initialization and return were missing
    in the reviewed source; reconstructed from the visible loop body.
    """
    count = 0
    # Iterate values directly instead of keys() + get(key).
    for values in inDict.values():
        count = max(count, len(values))
    return count
139 for key
in self.
docs.keys():
140 size = len(self.
docs.get(key))
154 length = min(len(lhs), len(rhs))
160 for i
in range(length):
161 if isinstance(lhs[i], str)
and isinstance(rhs[i], str)
and lhs[i] != rhs[i]:
169 if isinstance(lhs[i], tuple)
and isinstance(rhs[i], tuple)
and len(lhs[i]) == len(rhs[i]):
170 for j
in range(len(lhs[i])):
172 if lhs[i][j] != rhs[i][j]:
194 for key
in etree.keys():
195 pathList.extend(etree.get(key))
197 for index
in range(len(pathList) - 1):
198 commonPath = self.
getCommonPath(pathList[index], pathList[index + 1], logger)
200 if pathDict.has_key(str(commonPath)):
201 commonPathCount = int(pathDict.get(str(commonPath))[1])
203 pathDict[str(commonPath)] = (commonPath, commonPathCount + 1)
206 for elem
in pathDict.values():
207 localpathList.append(elem)
209 localpathList.sort(key=
lambda tup: tup[1], reverse=
True)
210 if len(localpathList) > 0:
211 ret = (localpathList[0])[0]
222 elementPath = copy.deepcopy(elemPath)
223 length = min(len(indexPath), len(elementPath))
225 if logger
is not None:
226 logger.debug(
'\n>>> indexPath: ' + str(indexPath))
227 logger.debug(
'\n>>> elementPath: ' + str(elementPath))
229 for i
in range(length):
230 if isinstance(indexPath[i], str)
and isinstance(elementPath[i], str)
and indexPath[i] != elementPath[i]:
231 if logger
is not None:
232 logger.debug(
"Both have type 'str' and indexPath[" + str(i) +
"] != elementPath[" + str(i) +
"]")
235 if isinstance(indexPath[i], tuple)
and isinstance(elementPath[i], tuple):
236 size = min(len(indexPath[i]), len(elementPath[i]))
237 for j
in range(size):
238 if indexPath[i][j] != elementPath[i][j]:
239 if logger
is not None:
240 logger.debug(
"Both have type 'tuple' and indexPath[" + str(i) +
"][" + str(j) +
"] != elementPath[" + \
241 str(i) +
"][" + str(j) +
"]")
244 if len(elementPath) > len(indexPath):
245 if logger
is not None:
246 logger.debug(
'type(elementPath[len(indexPath)])) = ' + str(
type(elementPath[len(indexPath)])) + \
247 ' elementPath[' + str(len(indexPath)) +
']: ' + str(elementPath[len(indexPath)]))
249 if isinstance(elementPath[len(indexPath)], tuple):
250 if len(elementPath[len(indexPath)]) > 1:
251 if logger
is not None:
252 logger.debug(
'>>> elementPath[' + str(len(indexPath)) +
'][1] = ' + str(elementPath[len(indexPath)][1]))
254 return elementPath[len(indexPath)][1]
271 if logger
is not None:
272 logger.info(
'Calculated indexPath: ' + str(indexPath))
274 if logger
is not None:
275 for key
in self.
etree:
276 logger.debug(
'len(self.etree.get(' + str(key) +
') = ' + str(len(self.
etree.get(key))))
277 for key
in self.
docs:
278 logger.debug(
'len(self.docs.get(' + str(key) +
') = ' + str(len(self.
docs.get(key))))
283 resultList.append(localRes)
285 if logger
is not None:
286 logger.debug(
'count = ' + str(count))
287 logger.debug(
'len(resultList) = ' + str(len(resultList)))
289 for key
in self.
docs.keys():
290 for index
in range(len(self.
docs.get(key))):
291 if logger
is not None:
292 logger.debug(
'==== key: ' + str(key) +
' index: ' + str(index) +
' ====')
294 if len(self.
etree.get(key)) > index:
296 if logger
is not None:
297 logger.debug(
'number = ' + str(number) +
' self.docs.get(' + str(key) +
')[' + str(index) +
'].tags: ' + \
300 if int(number) > 0
and int(number) <= len(self.
docs.get(key)):
301 if resultList[int(number) - 1].tags.has_key(key):
302 result = self.
updateTagValue(resultList[int(number) - 1], self.
docs.get(key)[index].tags, key)
303 resultList[int(number) - 1].tags.update(result.tags)
305 resultList[int(number) - 1].tags.update({key:self.
docs.get(key)[index].tags[key]})
307 if logger
is not None:
308 logger.debug(
"resultList[" + str(int(number) - 1) +
"].tags.update({" + str(key) +
":self.docs.get(" + \
309 str(key) +
")[" + str(index) +
"].tags[" + str(key) +
"]})")
311 for index
in range(0, len(resultList)):
314 for key
in self.
docs.keys():
315 if not resultList[index].tags.has_key(key)
and bool(mandatoryTags[key])
is True:
319 if resultList[index].tags.has_key(key):
320 countSelected = countSelected + 1
322 if countSelected == 0:
326 resTags.append(resultList[index])
328 if len(resTags) == 0:
def updateTagValue(self, result, tags, tag_name):
    """Merge the incoming tag payload into result for tag_name.

    The first "data" elements of the existing and incoming tag are
    concatenated; name/xpath/type/extractor are kept from the existing
    tag. result.tags[tag_name] is replaced in place and result returned
    (the caller at getAllDocs relies on the return value).
    """
    current = result.tags[tag_name]
    # Build the merged entry in one literal instead of filling
    # placeholder values and overwriting them field by field.
    result.tags[tag_name] = {
        "extractor": current["extractor"],
        "data": [current["data"][0] + tags[tag_name]["data"][0]],
        "name": current["name"],
        "xpath": current["xpath"],
        "type": current["type"],
    }
    return result
362 resTags = self.
getAllTags(mandatoryTags, logger)
367 if len(tagsNames) > 0:
370 for index
in range(count):
371 if len(self.
join.get(key)) > index
and \
372 len(self.
isExtract.get(key)) > index
and \
374 resDocs.append({
"obj": resTags[index],
375 "join": self.
join.get(key)[index],
376 "isExtract": self.
isExtract.get(key)[index],
377 "mandatory": self.
mandatory.get(key)[index],
378 CONSTS.TAG_ORDER_NUMBER: len(resDocs) + 1})
# Error messages used in exception reporting throughout the task.
MSG_ERROR_PARSE_CMD_PARAMS = "Error parse command line parameters."
MSG_ERROR_EMPTY_CONFIG_FILE_NAME = "Config file name is empty."
MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong"
MSG_ERROR_LOAD_PROPERTIES_FROM_FILE = "Error load Scraper multi items properties from file"
MSG_ERROR_LOAD_APP_CONFIG = "Error loading application config file."
MSG_ERROR_READ_LOG_CONFIG = "Error read log config file."
MSG_ERROR_READ_INPUT_DATA = "Error read input data from stdin."
MSG_ERROR_INPUT_DATA_NONE = "Input data is none"
MSG_ERROR_INPUT_DATA_WITHOUT_BATCH = "Input data without batch item."
MSG_ERROR_INPUT_DATA_WITHOUT_PROPERTIES = "Input data has batch item without 'properties'."
MSG_ERROR_GET_PROPERTIES = "Error getting properties from input data"
MSG_ERROR_LOAD_EXTRACTORS = "Error load extractors "
MSG_ERROR_ADJUST_PR = "Error adjust partial references. "
MSG_ERROR_ADJUST_PUBDATE = "Error adjust PUBDATE. "
MSG_ERROR_ADJUST_TITLE = "Error adjust title. "
MSG_ERROR_ADJUST_LINK_URL = "Error adjust link URL. "

# Configuration option names read from the application config file.
SCRAPER_MULTI_ITEMS_OPTION_LOG = "log"
SCRAPER_MULTI_ITEMS_OPTION_PROPERTY_JSON_FILE = "property_file_name"

# Environment/store path key used when pickling input/output for debugging.
ENV_SCRAPER_STORE_PATH = "self.ENV_SCRAPER_STORE_PATH"
# Extra xpath rules applied for news-like pages.
EXTENDED_NEWS_TAGS = {"description": ["//meta[@name='description']//@content"]}
DATA_NEWS_TAGS = [CONSTS.TAG_DC_DATE]

# Tag types treated as datetime templates (default when config section absent).
TAGS_DATETIME_TEMPLATE_TYPES = [CONSTS.TAG_TYPE_DATETIME]
OPTION_SECTION_DATETIME_TEMPLATE_TYPES = 'tags_datetime_template_types'

label = CONSTS.SCRAPER_MULTI_ITEMS_APP_CLASS_NAME
432 def __init__(self, usageModel=APP_CONSTS.APP_USAGE_MODEL_PROCESS, configFile=None, logger=None, inputData=None):
433 if usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
436 Scraper.__init__(self)
438 self.exitCode = APP_CONSTS.EXIT_SUCCESS
439 self.usageModel = usageModel
440 self.configFile = configFile
442 self.input_data = inputData
444 self.outputFormat =
None 445 self.output_data =
None 446 self.extractor =
None 450 self.errorMask = APP_CONSTS.ERROR_OK
451 self.xpathSplitString =
' ' 452 self.useCurrentYear = 0
453 self.datetimeTemplateTypes = []
454 self.dbWrapper =
None 455 self.mediaLimitsHandler =
None 460 if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
462 foundation.CementApp.setup(self)
466 if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
468 foundation.CementApp.run(self)
470 self.input_data = self.__getInputData()
473 config = self.__initApp(self.configFile)
477 if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
479 self.logger.info(APP_CONSTS.LOGGER_DELIMITER_LINE)
488 if configName
is None:
489 configName = self.pargs.config
493 config, confLogFileName, scraperPropertyFileName = self.__loadAppConfig(configName)
495 self.properties = self.__loadScraperProperties(scraperPropertyFileName)
497 if self.logger
is None:
498 self.__loadLogConfig(confLogFileName)
502 self.logger.info(
'self.properties: ' +
varDump(self.properties))
514 if scraperPropertyFileName
is not None:
516 with open(scraperPropertyFileName,
"rb")
as fd:
517 scraperProperies = json.loads(fd.read())
518 properties = scraperProperies[self.__class__.__name__][CONSTS.PROPERTIES_KEY]
519 except Exception, err:
520 if self.logger
is not None:
521 self.logger.
error(self.MSG_ERROR_LOAD_PROPERTIES_FROM_FILE +
" '" + \
522 str(scraperPropertyFileName) +
"': " + str(err))
536 scraperPropertyFileName =
'' 538 if configName
is None or configName ==
"":
539 raise Exception(self.MSG_ERROR_EMPTY_CONFIG_FILE_NAME)
541 config = ConfigParser.ConfigParser()
542 config.optionxform = str
544 readOk = config.read(configName)
547 raise Exception(self.MSG_ERROR_WRONG_CONFIG_FILE_NAME +
": " + configName)
549 if config.has_section(APP_CONSTS.CONFIG_APPLICATION_SECTION_NAME):
550 confLogFileName = Utils.getConfigParameter(config, APP_CONSTS.CONFIG_APPLICATION_SECTION_NAME, \
551 self.SCRAPER_MULTI_ITEMS_OPTION_LOG,
'')
553 scraperPropertyFileName = Utils.getConfigParameter(config, APP_CONSTS.CONFIG_APPLICATION_SECTION_NAME, \
554 self.SCRAPER_MULTI_ITEMS_OPTION_PROPERTY_JSON_FILE,
'')
556 self.useCurrentYear = config.getint(
"DateTimeType",
"useCurrentYear")
558 if config.has_section(self.OPTION_SECTION_DATETIME_TEMPLATE_TYPES):
559 self.datetimeTemplateTypes = []
560 for key, value
in config.items(self.OPTION_SECTION_DATETIME_TEMPLATE_TYPES):
561 self.datetimeTemplateTypes.append(key)
562 if self.logger
is not None:
563 self.logger.debug(
'load form config: ' + str(key) +
' = ' + str(value))
565 self.datetimeTemplateTypes = self.TAGS_DATETIME_TEMPLATE_TYPES
566 if self.logger
is not None:
567 self.logger.debug(
"Config file hasn't section: " + str(self.OPTION_SECTION_DATETIME_TEMPLATE_TYPES))
570 dbTaskIniConfigFileName = config.get(self.__class__.__name__,
"db-task_ini")
571 readOk = config.read(dbTaskIniConfigFileName)
573 raise Exception(self.MSG_ERROR_WRONG_CONFIG_FILE_NAME +
": " + dbTaskIniConfigFileName)
575 except Exception, err:
576 raise Exception(self.MSG_ERROR_LOAD_APP_CONFIG +
' ' + str(err))
578 return config, confLogFileName, scraperPropertyFileName
587 if isinstance(configName, str)
and len(configName) == 0:
588 raise Exception(self.MSG_ERROR_EMPTY_CONFIG_FILE_NAME)
590 logging.config.fileConfig(configName)
593 self.logger = Utils.MPLogger().
getLogger()
595 except Exception, err:
596 raise Exception(self.MSG_ERROR_READ_LOG_CONFIG +
' ' + str(err))
def getExtractorByName(self, extractorName):
    """Return the loaded extractor whose class name equals extractorName.

    Returns None when no extractor matches.
    NOTE(review): the return statements were missing in the reviewed
    source; reconstructed from the visible lookup loop — confirm.
    """
    for extractor in self.extractors:
        if extractor.__class__.__name__ == extractorName:
            return extractor
    return None
def __getInputData(self):
    """Read and unpickle the scraper input data object from stdin.

    Returns the unpickled object; raises Exception (with
    MSG_ERROR_READ_INPUT_DATA) when reading or unpickling fails.
    """
    scraperInputData = None
    try:
        # SECURITY NOTE(review): pickle.loads on raw stdin is unsafe for
        # untrusted input; acceptable only because the parent process
        # controls the pipe in the PROCESS usage model.
        scraperInputData = pickle.loads(sys.stdin.read())
    except Exception as err:  # 'as' form works on Python 2.6+ and 3.x
        if self.logger is not None:
            ExceptionLog.handler(self.logger, err, self.MSG_ERROR_READ_INPUT_DATA)
        raise Exception(self.MSG_ERROR_READ_INPUT_DATA + ' ' + str(err))
    return scraperInputData
def __checkInputData(self, inputData):
    """Validate incoming task data.

    Raises Exception when the input data itself, its batch item, or the
    batch item's properties are absent.
    """
    if inputData is None:
        raise Exception(self.MSG_ERROR_INPUT_DATA_NONE)
    batchItem = inputData.batch_item
    if batchItem is None:
        raise Exception(self.MSG_ERROR_INPUT_DATA_WITHOUT_BATCH)
    if batchItem.properties is None:
        raise Exception(self.MSG_ERROR_INPUT_DATA_WITHOUT_PROPERTIES)
659 if inputData.batch_item.urlObj
is not None:
660 urlString = inputData.batch_item.urlObj.url
663 logMsg =
"BatchItem.siteId=" + str(inputData.batch_item.siteId) + \
664 ", BatchItem.urlId=" + str(inputData.batch_item.urlId) + \
665 ", BatchItem.urlObj.url=" + urlString
666 app.Profiler.messagesList.append(logMsg)
667 self.logger.info(
"Incoming data: %s", logMsg)
678 if inputData.output_format
is not None and "name" in inputData.output_format:
679 outputFormat = inputData.output_format[
"name"]
681 if outputFormat
is None and "templates" in inputData.batch_item.properties[
"template"]
and \
682 len(inputData.batch_item.properties[
"template"][
"templates"]) > 0
and \
683 "output_format" in inputData.batch_item.properties[
"template"][
"templates"][0]
and \
684 "name" in inputData.batch_item.properties[
"template"][
"templates"][0][
"output_format"]:
685 outputFormat = inputData.batch_item.properties[
"template"][
"templates"][0][
"output_format"][
"name"]
687 self.logger.debug(
">>> 'output_format' hasn't in template of input batch.")
699 if "TAGS_MAPPING" in inputData.batch_item.properties
and \
700 inputData.batch_item.properties[
"TAGS_MAPPING"]
is not None:
702 altTagsMask = json.loads(inputData.batch_item.properties[
"TAGS_MAPPING"])
703 self.logger.debug(
">>> AltTagsMask = " + str(altTagsMask))
704 except Exception, err:
705 ExceptionLog.handler(self.logger, err,
'Bad TAGS_MAPPING properties value:', \
706 (inputData.batch_item.properties[
"TAGS_MAPPING"]), \
707 {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
721 if (self.input_data
is not None)
and \
722 inputData.processor_properties
is not None:
723 processor_properties = inputData.processor_properties
724 self.logger.debug(
"Processor's properties was taken from input data: %s" % processor_properties)
725 self.logger.debug(
"Processor's properties type: %s" % str(
type(processor_properties)))
726 if not isinstance(processor_properties, dict):
727 processor_properties = json.loads(inputData.processor_properties)
728 self.logger.debug(
"Processor's properties was taken from input data: %s" % processor_properties)
729 properties = processor_properties
731 self.logger.debug(
'>>> inputData.batch_item.properties: ' +
varDump(inputData.batch_item.properties) + \
732 ' type: ' + str(
type(inputData.batch_item.properties)))
733 if isinstance(inputData.batch_item.properties, dict):
734 properties.update(inputData.batch_item.properties)
736 except Exception, err:
737 ExceptionLog.handler(self.logger, err, self.MSG_ERROR_GET_PROPERTIES, (inputData.processor_properties))
753 modules = self.properties[CONSTS.MODULES_KEY][algorithmName]
755 self.logger.debug(
"Algorithm name: <%s>" % (algorithmName))
756 self.logger.debug(
"Modules: %s" % modules)
758 for module
in modules:
759 exrtactor = self.__createModule(module, config, urlHost)
761 if exrtactor
is not None:
762 extractors.append(exrtactor)
765 self.logger.debug(
"*******************")
766 self.logger.debug(
"Loaded extractors:")
767 for extractor
in extractors:
768 self.logger.debug(extractor.name)
769 self.logger.debug(
"*******************")
771 except Exception, err:
772 ExceptionLog.handler(self.logger, err, self.MSG_ERROR_LOAD_EXTRACTORS)
773 raise Exception(self.MSG_ERROR_LOAD_EXTRACTORS +
' ' + str(err))
788 appInst = (moduleName, eval(moduleName)(config,
None, urlHost))[1]
789 self.logger.debug(
"%s has been created!" % moduleName)
790 except Exception, err:
791 ExceptionLog.handler(self.logger, err,
"Can't create module %s. Error is:" % (moduleName))
1092 for key
in response.tags:
1093 if key
in self.DATA_NEWS_TAGS:
1096 if isinstance(response.tags[key], str)
or isinstance(response.tags[key], unicode):
1097 tagsValue = response.tags[key]
1098 elif isinstance(response.tags[key], dict)
and "data" in response.tags[key]:
1099 if isinstance(response.tags[key][
"data"], str)
or isinstance(response.tags[key][
"data"], unicode):
1100 tagsValue = response.tags[key][
"data"]
1101 elif isinstance(response.tags[key][
"data"], list)
and len(response.tags[key][
"data"]) > 0
and \
1102 isinstance(response.tags[key][
"data"][0], str)
or isinstance(response.tags[key][
"data"][0], unicode):
1103 tagsValue = response.tags[key][
"data"][0]
1105 if tagsValue
is not None:
1107 dt = parser.parse(tagsValue)
1108 int(time.mktime(dt.timetuple()))
1110 removeKeys.append(key)
1112 for key
in removeKeys:
1113 if key
in response.tags:
1114 logging.debug(
">>> Remove " + key +
" element besause it empty")
1115 del response.tags[key]
1119 self.logger.debug(
'>>> preparseResponse enter <<<')
1121 for key
in response.tags:
1122 if response.tags[key]
is not None:
1123 if "data" in response.tags[key]:
1124 if isinstance(response.tags[key][
"data"], str)
or isinstance(response.tags[key][
"data"], unicode):
1125 localStr = response.tags[key][
"data"]
1127 self.logger.debug(
'-----------------------------------------')
1128 self.logger.debug(
'key: ' + str(key) +
' => ' + str(localStr))
1129 self.logger.debug(
'-----------------------------------------')
1131 response.tags[key][
"data"] = []
1132 response.tags[key][
"data"].append(localStr)
1134 self.logger.debug(
'response.tags[key]["data"]: ' + str(response.tags[key][
"data"]))
1135 self.logger.debug(
'-----------------------------------------')
def formatOutpuElement(self, elem, localOutputFormat):
    """Format a single extracted value for the requested output format.

    json  -> JSON-encode and strip one pair of surrounding quotes;
    html  -> XML-escape, additionally encoding ' and " as entities;
    sql   -> escape via Utils.escape;
    other -> returned unchanged.
    """
    ret = elem
    if localOutputFormat == "json":
        localStr = json.dumps(elem, ensure_ascii=False)
        # json.dumps wraps strings in quotes; strip one leading/trailing
        # quote so the value embeds cleanly into larger JSON documents.
        if localStr[0] == '\"' or localStr[0] == '\'':
            localStr = localStr[1:]
        if localStr[-1] == '\"' or localStr[-1] == '\'':
            localStr = localStr[0:-1]
        ret = localStr
    elif localOutputFormat == "html":
        # The entity map had been garbled to raw quotes in the source
        # (a no-op); restore the intended character references.
        ret = xml.sax.saxutils.escape(elem, {"'": "&#39;", "\"": "&quot;"})
    elif localOutputFormat == "sql":
        ret = Utils.escape(elem)
    return ret
1159 for key
in response.tags:
1160 if response.tags[key]
is not None:
1161 if "data" in response.tags[key]:
1162 if isinstance(response.tags[key][
"data"], list):
1163 for i, elem
in enumerate(response.tags[key][
"data"]):
1164 response.tags[key][
"data"][i] = self.formatOutpuElement(elem, localOutputFormat)
1165 elif isinstance(response.tags[key][
"data"], str)
or isinstance(response.tags[key][
"data"], unicode):
1166 response.tags[key][
"data"] = self.formatOutpuElement(response.tags[key][
"data"], localOutputFormat)
1175 self.extractor =
ScrapyExtractor(config, self.input_data.template, urlHost)
1176 sel = Selector(text=self.input_data.raw_content)
1177 if isinstance(self.input_data.template, dict):
1178 template = self.input_data.template
1186 for key, value
in template.items():
1188 self.logger.debug(
">>> Calculate mandatory for '" + str(key) +
"'")
1190 self.logger.debug(
">>> mandatory = " + str(elem[
"mandatory"]) +
" type: " + str(
type(elem[
"mandatory"])))
1191 if bool(elem[
"mandatory"])
is False:
1195 mandatoryTags[key] = isMandatory
1197 self.logger.debug(
">>> Calculated mandatoryTags: " +
varDump(mandatoryTags))
1202 for key
in template:
1203 self.logger.debug(
">>> Template key: " + key)
1204 if "state" in template[key]
and not bool(int(template[key][
"state"])):
1205 self.logger.debug(
">>> Template disable: template name = " + str(key))
1207 for path
in template[key]:
1208 if not isinstance(path, dict):
1209 self.logger.debug(
">>> WARNING path not DICT type ")
1213 localResult =
Result(
None, self.input_data.urlId)
1219 self.logger.debug(
">>> Logging xPath trees for key: '" + str(key) +
"'")
1220 etrees = sel.xpath(path[
'target'])
1221 for etree
in etrees:
1223 self.logger.debug(
">>> etree: " +
varDump(etree))
1224 if isinstance(etree._root, basestring):
1227 etreeValue = self.get_path(etree._root)
1228 self.logger.debug(
'>>> etreeValue: ' +
varDump(etreeValue))
1229 scraperDocs.addEtree(key, copy.deepcopy(etreeValue))
1233 if CONSTS.TAG_MARKUP_PROP_NAME
in self.properties
else None)
1235 self.logger.debug(
">>> xPathPreparing: " +
varDump(xPathPreparing))
1236 self.logger.debug(
">>> path: " +
varDump(path))
1237 self.logger.debug(
">>> sel: " +
varDump(sel))
1239 self.logger.debug(
">>> self.properties: " +
varDump(self.properties))
1241 self.xpathSplitString = xPathPreparing.resolveDelimiter(path, self.properties, self.xpathSplitString)
1242 innerDelimiter = xPathPreparing.resolveInnerDelimiter(path, self.properties)
1243 self.logger.debug(
">>> xpathSplitString: '" + str(self.xpathSplitString) +
"'")
1244 self.logger.debug(
">>> innerDelimiter: '" + str(innerDelimiter) +
"'")
1246 xpath, xpathValue = xPathPreparing.process(path, sel, self.xpathSplitString, innerDelimiter,
1247 Utils.innerTextToList)
1248 except Exception, err:
1249 ExceptionLog.handler(self.logger, err,
"some rule/xpath exception:", (), \
1250 {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
1253 self.logger.debug(
">>> xpathValue " + str(
type(xpathValue)) +
" " + str(xpathValue))
1254 self.logger.debug(
">>> xpath: %s" % str(xpath))
1255 if (isinstance(xpathValue, list)
and len(xpathValue) == 0)
or\
1256 (isinstance(xpathValue, basestring)
and xpathValue ==
''):
1257 self.logger.debug(
">>> set default xpathValue")
1259 xpathValue.append(path[
"default"])
1262 if not isinstance(xpathValue, list):
1263 xpathValue = [xpathValue]
1265 for xpathElem
in xpathValue:
1266 elemResult = copy.deepcopy(localResult)
1267 self.logger.debug(
"result before:\n%s",
varDump(elemResult))
1268 self.extractor.addTag(result=elemResult, tag_name=key, tag_value=xpathElem, xpath=xpath,
1269 isDefaultTag=(
not isExtract), callAdjustment=
False, tagType=path[
"type"],
1270 allowNotFilled=
True)
1272 self.logger.debug(
"result after:\n%s",
varDump(elemResult))
1274 self.logger.debug(
">>> tag type = " + str(
type(elemResult.tags)))
1275 self.logger.debug(
">>> tags data type = " + str(
type(elemResult.tags[key][
"data"])))
1277 if key
in elemResult.tags
and isinstance(elemResult.tags[key][
"data"], basestring):
1278 self.logger.debug(
">>> Convert result = " + str(key))
1279 localString = elemResult.tags[key][
"data"]
1280 elemResult.tags[key][
"data"] = []
1281 elemResult.tags[key][
"data"].append(localString)
1283 if isExtract
and "postProcessing" in path
and path[
"postProcessing"]
is not None and \
1284 path[
"postProcessing"] !=
"":
1285 self.applyPostProcessing(elemResult, key, path[
"postProcessing"])
1288 self.logger.debug(
"scraperDocs.addDoc key: " + str(key) +
' mandatory = ' +
varDump(mandatoryTags[key]))
1290 scraperDocs.addDoc(key, elemResult, path[
"join"], isExtract,
1291 (bool(path[
"mandatory"])
if "mandatory" in path
else False))
1295 resultDocs = scraperDocs.getAllDocs(mandatoryTags, self.logger)
1297 for elem
in resultDocs:
1298 result =
Result(
None, self.input_data.urlId)
1300 self.addCustomTag(result=result, tag_name=CONSTS.TAG_ORDER_NUMBER, \
1301 tag_value=str(elem[CONSTS.TAG_ORDER_NUMBER]))
1303 self.addCustomTag(result=result, tag_name=CONSTS.TAG_SOURCE_URL, \
1304 tag_value=[self.input_data.url])
1307 prepareResultsList = self.prepareResults([elem])
1308 self.compileResults(result, prepareResultsList, key, xPathPreparing)
1309 result.finish = time.time()
1310 resultsList.append(copy.deepcopy(result))
1393 if key
in result.tags
and "data" in result.tags[key]
and result.tags[key][
"data"]
is not None and \
1394 len(result.tags[key][
"data"]) > 0:
1396 matchingVal = re.compile(postProcessingRE)
1397 except re.error
as err:
1398 self.logger.debug(
">>> RE error = " + str(err))
1399 self.errorMask = self.errorMask | APP_CONSTS.ERROR_RE_ERROR
1402 matchingResult = matchingVal.findall(result.tags[key][
"data"][0])
1403 if matchingResult
is not None:
1404 for elem
in matchingResult:
1405 if isinstance(elem, str)
or isinstance(elem, unicode):
1409 for innerElem
in elem:
1410 if innerElem
is not None and innerElem !=
'':
1411 tmpStr += str(innerElem)
1413 tmpStr = tmpStr.strip()
1415 self.logger.debug(
">>> Replace value, prev. value is = " + result.tags[key][
"data"][0])
1416 result.tags[key][
"data"][0] = tmpStr
1419 result.tags[key][
"data"][0] =
def getProcessedContent(self, result):
    """Wrap the scraping result into the processed-content dict.

    Returns {"default": result, "internal": [result], "custom": []}.
    Side effect: when the result carries a non-empty "pubdate" tag,
    self.pubdate is set to its "data" payload.
    """
    processedContent = {"default": result, "internal": [result], "custom": []}
    pubdateTag = result.tags.get("pubdate") if isinstance(result.tags, dict) else None
    if pubdateTag is not None and "data" in pubdateTag and len(pubdateTag["data"]) > 0:
        self.pubdate = pubdateTag["data"]
        self.logger.debug('>>>> Set self.pubdate = ' + str(self.pubdate))
    return processedContent
1458 self.__checkInputData(self.input_data)
1460 self.logger.info(
'Start processing on BatchItem from Batch: ' + str(self.input_data.batchId))
1463 self.__fillProfilerMessageList(self.input_data)
1464 self.logger.debug(
"self.inputData:\n%s",
varDump(self.input_data))
1467 self.outputFormat = self.__getOutputFormat(self.input_data)
1470 altTagsMask = self.__getAltTagsMask(self.input_data)
1473 properties = self.__getPropertiesFromInputData(self.input_data)
1474 if properties
is not None:
1475 self.properties = properties
1477 algorithmName = self.properties[CONSTS.ALGORITHM_KEY][CONSTS.ALGORITHM_NAME_KEY]
1479 self.logger.debug(
"Algorithm : %s" % algorithmName)
1480 if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1481 Utils.storePickleOnDisk(self.input_data, self.ENV_SCRAPER_STORE_PATH,
"scraper.in." + \
1482 str(self.input_data.urlId))
1485 sys.stdout = open(
"/dev/null",
"wb")
1491 urlHost = self.calcUrlDomainCrc(self.input_data.url)
1492 self.logger.info(
'urlHost: ' + str(urlHost))
1494 self.extractors = self.__loadExtractors(algorithmName, config, urlHost)
1498 self.logger.info(
"input_data url: %s, urlId: %s, siteId: %s", str(self.input_data.url), str(self.input_data.urlId),
1499 str(self.input_data.siteId))
1507 self.itr = iter(sorted(self.extractors, key=
lambda extractor: extractor.rank, reverse=
True))
1508 self.logger.debug(
"Extractors: %s" %
varDump(self.itr))
1511 responses = self.templateExtraction(config, urlHost)
1513 if CONSTS.MEDIA_LIMITS_NAME
in self.input_data.batch_item.properties:
1514 self.logger.debug(
"Found property '%s'", str(CONSTS.MEDIA_LIMITS_NAME))
1515 self.mediaLimitsHandler =
MediaLimitsHandler(self.input_data.batch_item.properties[CONSTS.MEDIA_LIMITS_NAME])
1518 scraperResponseList = []
1519 for response
in responses:
1520 if response
is not None:
1521 response.stripResult()
1524 if algorithmName != CONSTS.PROCESS_ALGORITHM_REGULAR:
1525 self.adjustTitle(response)
1526 self.adjustLinkURL(response)
1527 self.adjustPartialReferences(response)
1528 self.logger.debug(
"PDate: %s" % str(self.input_data.batch_item.urlObj.pDate))
1529 self.logger.debug(
"PDate type: %s" % str(
type(self.input_data.batch_item.urlObj.pDate)))
1532 self.preparseResponse(response)
1534 self.logger.debug(
'>>>>> self.properties = ' +
varDump(self.properties))
1538 pdateSourceMask = APP_CONSTS.PDATE_SOURCES_MASK_BIT_DEFAULT
1539 pdateSourceMaskOverwrite = APP_CONSTS.PDATE_SOURCES_MASK_OVERWRITE_DEFAULT
1542 if APP_CONSTS.PDATE_SOURCES_MASK_PROP_NAME
in self.input_data.batch_item.properties:
1543 pdateSourceMask = int(self.input_data.batch_item.properties[APP_CONSTS.PDATE_SOURCES_MASK_PROP_NAME])
1546 if APP_CONSTS.PDATE_SOURCES_MASK_OVERWRITE_PROP_NAME
in self.input_data.batch_item.properties:
1547 pdateSourceMaskOverwrite = \
1548 int(self.input_data.batch_item.properties[APP_CONSTS.PDATE_SOURCES_MASK_OVERWRITE_PROP_NAME])
1550 self.logger.debug(
'pdateSourceMask = %s, pdateSourceMaskOverwrite = %s',
1551 str(pdateSourceMask), str(pdateSourceMaskOverwrite))
1553 self.logger.debug(
"!!! self.input_data.batch_item.urlObj.pDate = " + str(self.input_data.batch_item.urlObj.pDate))
1557 if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_RSS_FEED:
1558 if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_RSS_FEED)
or \
1559 not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_RSS_FEED:
1560 self.pubdate, timezone = self.extractPubdateRssFeed(self.input_data.siteId, self.input_data.url)
1563 if CONSTS.TAG_DC_DATE
in response.tags
and pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_DC_DATE:
1564 if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_DC_DATE
and self.pubdate
is None)
or \
1565 not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_DC_DATE:
1566 if CONSTS.TAG_PUB_DATE
not in response.tags
or \
1567 (isinstance(response.tags[CONSTS.TAG_PUB_DATE][
"data"], basestring)
and \
1568 response.tags[CONSTS.TAG_PUB_DATE][
"data"].strip() ==
""):
1569 response.tags[CONSTS.TAG_PUB_DATE] = copy.deepcopy(response.tags[CONSTS.TAG_DC_DATE])
1570 response.tags[CONSTS.TAG_PUB_DATE][
"name"] = CONSTS.TAG_PUB_DATE
1571 if len(response.tags[CONSTS.TAG_PUB_DATE]) > 0
and response.tags[CONSTS.TAG_PUB_DATE][0]:
1572 self.pubdate = response.tags[CONSTS.TAG_PUB_DATE][0]
1573 self.logger.debug(
"Pubdate from 'dc_date': " + str(self.pubdate))
1577 if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_PUBDATE:
1578 if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_PUBDATE
and self.pubdate
is None)
or \
1579 not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_PUBDATE:
1580 pubdate, timezone = self.normalizeDatetime(response, algorithmName)
1581 if pubdate
is not None:
1582 self.pubdate = pubdate
1583 self.logger.debug(
"Pubdate from 'pubdate': " + str(self.pubdate))
1586 if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_NOW:
1587 if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_NOW
and self.pubdate
is None)
or \
1588 not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_NOW:
1590 self.logger.debug(
"Pubdate from 'SQL NOW()': " + str(self.pubdate))
1593 if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_SQL_EXPRESSION
and \
1594 APP_CONSTS.PDATE_SOURCES_EXPRESSION_PROP_NAME
in self.properties:
1595 if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_SQL_EXPRESSION
and self.pubdate
is None)
or \
1596 not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_SQL_EXPRESSION:
1597 self.pubdate =
SQLExpression(str(self.properties[APP_CONSTS.PDATE_SOURCES_EXPRESSION_PROP_NAME]))
1598 self.logger.debug(
"Pubdate from 'sql expression': " + str(self.pubdate))
1601 self.pubdate = self.pubdateMonthOrder(self.pubdate, self.input_data.batch_item.properties, self.input_data.url)
1604 self.input_data.batch_item.urlObj.pDate = self.pubdate
1605 self.pubdate = FieldsSQLExpressionEvaluator.evaluatePDateTime(self.input_data.batch_item.properties,
1607 self.input_data.batch_item.urlObj,
1612 self.pubdate, timezone = self.pubdateTransform(self.pubdate,
1614 self.input_data.batch_item.properties,
1615 self.input_data.url)
1618 self.addCustomTag(result=response, tag_name=CONSTS.TAG_PUBDATE_TZ, tag_value=[timezone])
1620 if "pubdate" in response.tags
and "data" in response.tags[
"pubdate"]
and \
1621 len(response.tags[
"pubdate"][
"data"]) > 0:
1622 response.tags[
"pubdate"][
"data"][0] = self.pubdate
1624 if self.outputFormat
is None:
1625 self.logger.debug(
">>> Warning, can't extract output format")
1627 self.formatOutputData(response, self.outputFormat)
1629 response.recalcTagMaskCount(
None, altTagsMask)
1631 self.logger.debug(
"response.tagsCount: " + str(response.tagsCount) + \
1632 " response.tagsMasks: " + str(response.tagsMask) + \
1633 "\n>>> Resp: " +
varDump(response))
1637 if len(responses) > 0:
1638 startTime = responses[0].start
1640 finishTime = time.time()
1642 for response
in responses:
1643 response.start = startTime
1644 response.finish = finishTime
1645 response.data[
"time"] =
"%s" % str(finishTime - startTime)
1647 response = self.applyHTTPRedirectLink(self.input_data.batch_item.siteId, self.input_data.batch_item.urlObj.url,
1648 self.input_data.batch_item.properties, response)
1651 processedContent = self.getProcessedContent(response)
1652 scraperResponseList.append(
ScraperResponse(response.tagsCount, response.tagsMask, self.pubdate, \
1653 processedContent, self.errorMask))
1655 self.logger.debug(
'len(scraperResponseList): ' +
varDump(len(scraperResponseList)))
1656 self.logger.debug(
'maxURLsFromPage: ' + str(self.input_data.batch_item.urlObj.maxURLsFromPage))
1659 if self.input_data.batch_item.urlObj.maxURLsFromPage
is not None and \
1660 int(self.input_data.batch_item.urlObj.maxURLsFromPage) > 0
and \
1661 int(self.input_data.batch_item.urlObj.maxURLsFromPage) < len(scraperResponseList):
1662 self.logger.debug(
'>>> scraperResponseList 1')
1663 scraperResponseList = scraperResponseList[0: int(self.input_data.batch_item.urlObj.maxURLsFromPage)]
1664 self.logger.debug(
'>>> scraperResponseList 2')
1665 scraperResponseList[-1].errorMask |= APP_CONSTS.ERROR_MAX_URLS_FROM_PAGE
1666 self.logger.debug(
"Truncated scraper responces list because over limit 'maxURLsFromPage' = " + \
1667 str(self.input_data.batch_item.urlObj.maxURLsFromPage) +
" set errorMask = " + \
1668 str(APP_CONSTS.ERROR_MAX_URLS_FROM_PAGE))
1674 if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1675 output_pickled_object = pickle.dumps(scraperResponseList)
1676 Utils.storePickleOnDisk(output_pickled_object, self.ENV_SCRAPER_STORE_PATH,
1677 "scraper.out." + str(self.input_data.urlId))
1678 print output_pickled_object
1681 self.output_data = scraperResponseList
1682 self.logger.debug(
'self.output_data: ' + str(
varDump(self.output_data)))
def get_path(self, etreeElement, path=None):
    """Recursively build the path from the tree root down to etreeElement.

    Each non-root ancestor contributes a (tag, "1-based sibling index")
    tuple; the root contributes its bare tag. Returns the accumulated
    list, root first.
    NOTE(review): guard/return lines were missing in the reviewed
    source; reconstructed from the visible recursion — confirm.
    """
    # None sentinel: a fresh accumulator per top-level call avoids the
    # shared-mutable-default pitfall.
    rpath = [] if path is None else path
    p = etreeElement.getparent()
    if p is not None:
        index = p.index(etreeElement) + 1
        rpath.insert(0, (etreeElement.tag, str(index)))
        return self.get_path(p, rpath)
    rpath.insert(0, etreeElement.tag)
    return rpath
def getAllDocs(self, mandatoryTags, logger=None)
def __loadExtractors(self, algorithmName, config, urlHost)
def calculateIndexPath(self, etree, logger=None)
def getAllTags(self, mandatoryTags, logger=None)
def __loadAppConfig(self, configName)
def __fillProfilerMessageList(self, inputData)
def __getPropertiesFromInputData(self, inputData)
def refineBadDateTags(self, response)
def getProcessedContent(self, result)
def getIndexNumberOfPath(self, indexPath, elemPath, logger=None)
def __loadScraperProperties(self, scraperPropertyFileName)
def __init__(self, usageModel=APP_CONSTS.APP_USAGE_MODEL_PROCESS, configFile=None, logger=None, inputData=None)
def __loadLogConfig(self, configName)
def addDoc(self, key, value, join, isExtract, mandatory)
def __getOutputFormat(self, inputData)
def get_path(self, etreeElement, path=None)
def preparseResponse(self, response)
def applyPostProcessing(self, result, key, postProcessingRE)
def updateTagValue(self, result, tags, tag_name)
def __checkInputData(self, inputData)
def addEtree(self, key, value)
def getMaxCount(self, inDict)
def __initApp(self, configName=None)
def __getAltTagsMask(self, inputData)
def getCommonPath(self, lhs, rhs, logger=None)
def formatOutpuElement(self, elem, localOutputFormat)
def process(self, config)
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
def getTagNamesExistAllDocs(self)
def templateExtraction(self, config, urlHost)
def formatOutputData(self, response, localOutputFormat)
def getExtractorByName(self, extractorName)
def __init__(self, keys, urlId)
def __createModule(self, moduleName, config, urlHost)