3 Created on Mar 02, 2016 7 @link: http://hierarchical-cluster-engine.com/ 8 @copyright: Copyright © 2013-2014 IOIX Ukraine 9 @license: http://hierarchical-cluster-engine.com/license/ 23 import xml.sax.saxutils
25 import cPickle
as pickle
29 from cement.core
import foundation
# NOTE(review): extraction-mangled fragment — class-level constants of the scraper
# application class (original lines 63-79; interior lines missing). Kept byte-identical.
# MSG_ERROR_LOAD_EXTRACTORS / MSG_ERROR_WRONG_CONFIG_FILE_NAME: error-message prefixes.
# ENV_SCRAPER_STORE_PATH: env-var name used by Utils.storePickleOnDisk below.
# TAGS_DATETIME_*: default tag-name/type lists for pubdate handling.
# 'label' is the Cement app label — presumably required by foundation.CementApp; verify.
63 MSG_ERROR_LOAD_EXTRACTORS =
"Error load extractors " 65 ENV_SCRAPER_STORE_PATH =
"ENV_SCRAPER_STORE_PATH" 67 TAGS_DATETIME_NEWS_NAMES = [CONSTS.TAG_PUB_DATE, CONSTS.TAG_DC_DATE]
72 MSG_ERROR_WRONG_CONFIG_FILE_NAME =
"Config file name is wrong" 74 TAGS_DATETIME_TEMPLATE_TYPES = [CONSTS.TAG_TYPE_DATETIME]
75 OPTION_SECTION_DATETIME_TEMPLATE_TYPES =
'tags_datetime_template_types' 79 label = CONSTS.SCRAPER_CUSTOM_JSON_APP_CLASS_NAME
# Constructor (original lines 85-118; lines 87-88, 90, 94, 96-101, 106-108, 111, 114
# are missing from this extraction — kept byte-identical, comments only).
# Initializes both usage models: under APP_USAGE_MODEL_PROCESS the app presumably also
# initializes the CementApp base (missing lines 87-88 — TODO confirm); Scraper base is
# always initialized. All result-carrying attributes start empty/None; note errorMask
# is assigned ERROR_OK twice (lines 102 and 113) — redundant but harmless as visible.
85 def __init__(self, usageModel=APP_CONSTS.APP_USAGE_MODEL_PROCESS, configFile=None, logger=None, inputData=None):
86 if usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
89 Scraper.__init__(self)
91 self.exitCode = APP_CONSTS.EXIT_SUCCESS
92 self.usageModel = usageModel
93 self.configFile = configFile
95 self.input_data = inputData
102 self.errorMask = APP_CONSTS.ERROR_OK
103 self.scraperPropFileName =
None 104 self.algorithm_name =
None 105 self.scraperResponses = []
109 self.processedContent =
None 110 self.outputFormat =
None 112 self.altTagsMask =
None 113 self.errorMask = APP_CONSTS.ERROR_OK
115 self.output_data =
None 116 self.dbWrapper =
None 117 self.datetimeTemplateTypes = []
118 self.useCurrentYear = 0
# CementApp lifecycle overrides (original lines 123-152, heavily gapped).
# setup()/run() delegate to foundation.CementApp only in process usage model
# (in library-usage mode the Cement framework is bypassed). The subsequent calls
# load the log config and scraper properties; the delimiter line is logged only
# in process mode. Method boundaries are not visible here — TODO confirm against
# the full file before any restructuring.
123 if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
125 foundation.CementApp.setup(self)
131 if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
133 foundation.CementApp.run(self)
139 self.loadLogConfigFile()
145 self.loadScraperProperties()
150 if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
152 self.logger.info(APP_CONSTS.LOGGER_DELIMITER_LINE)
# loadLogConfigFile body (original lines 159-183, gapped).
# Builds a case-sensitive ConfigParser (optionxform = str keeps option-name case),
# reads the config from --config CLI arg, the class-name default, or the explicit
# configFile depending on usage model, then configures logging from the
# [Application] "log" file and obtains an MPLogger. Any failure is re-raised
# wrapped in MSG_ERROR_LOAD_CONFIG (py2 `except Exception, err` syntax).
159 self.config = ConfigParser.ConfigParser()
160 self.config.optionxform = str
161 if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
162 if self.pargs.config:
163 self.config.read(self.pargs.config)
165 self.config.read(CONSTS.SCRAPER_CUSTOM_JSON_APP_CLASS_NAME)
167 self.config.read(self.configFile)
177 if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
178 log_conf_file = self.config.get(
"Application",
"log")
179 logging.config.fileConfig(log_conf_file)
181 self.logger = Utils.MPLogger().
getLogger()
182 except Exception, err:
183 raise Exception(CONSTS.MSG_ERROR_LOAD_CONFIG +
" : " + str(err))
# loadScraperProperties / config-derived settings (original lines 192-230, gapped).
# Reads: [Application] property_file_name, [DateTimeType] useCurrentYear, the
# optional datetime-template-types section (keys become the list; falls back to
# TAGS_DATETIME_TEMPLATE_TYPES), and a per-class db-task ini file (raises
# MSG_ERROR_WRONG_CONFIG_FILE_NAME when unreadable). Then loads the scraper
# properties JSON keyed by class name / PROPERTIES_KEY; load errors are only
# logged (deliberate best-effort — properties stay at their previous value).
# Typo in log text 'load form config' is runtime output — left untouched here.
192 self.scraperPropFileName = self.config.get(
"Application",
"property_file_name")
196 self.useCurrentYear = self.config.getint(
"DateTimeType",
"useCurrentYear")
198 if self.config.has_section(self.OPTION_SECTION_DATETIME_TEMPLATE_TYPES):
199 self.datetimeTemplateTypes = []
200 for key, value
in self.config.
items(self.OPTION_SECTION_DATETIME_TEMPLATE_TYPES):
201 self.datetimeTemplateTypes.append(key)
202 if self.logger
is not None:
203 self.logger.debug(
'load form config: ' + str(key) +
' = ' + str(value))
205 self.datetimeTemplateTypes = self.TAGS_DATETIME_TEMPLATE_TYPES
206 if self.logger
is not None:
207 self.logger.debug(
"Config file hasn't section: " + str(self.OPTION_SECTION_DATETIME_TEMPLATE_TYPES))
210 dbTaskIniConfigFileName = self.config.get(self.__class__.__name__,
"db-task_ini")
211 config = ConfigParser.ConfigParser()
212 config.optionxform = str
213 readOk = config.read(dbTaskIniConfigFileName)
215 raise Exception(self.MSG_ERROR_WRONG_CONFIG_FILE_NAME +
": " + dbTaskIniConfigFileName)
224 if self.scraperPropFileName
is not None:
226 with open(self.scraperPropFileName,
"rb")
as fd:
227 scraperProperies = json.loads(fd.read())
228 self.properties = scraperProperies[self.__class__.__name__][CONSTS.PROPERTIES_KEY]
229 except Exception
as excp:
230 self.logger.debug(
">>> Some error with scraper property loads = " + str(excp))
# Main batch-processing body (original lines 236-333, gapped).
# Flow: in process mode, read a pickled ScraperInData from stdin (pickle.loads on
# piped input — trusted-pipeline assumption; unsafe for untrusted data) and adopt
# it as self.input_data. Resolve outputFormat from input_data.output_format or the
# first template's output_format; parse optional TAGS_MAPPING into altTagsMask;
# merge processor_properties (JSON-decoded if not already a dict) into
# self.properties; pick algorithm_name; persist the raw input pickle for debugging;
# parse optional "metrics"; redirect stdout to /dev/null so extractor prints don't
# corrupt the pickled stdout protocol; run loadExtractors + jsonParserProcess; in
# process mode print the pickled responses to stdout, otherwise store them in
# self.output_data. Errors set exitCode and re-raise.
# NOTE(review): EXIT_FAILURE is used unqualified (lines 245/332) while success uses
# APP_CONSTS.EXIT_SUCCESS — presumably imported via a star-import or module global;
# verify it resolves, otherwise this would NameError in the error path.
236 if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
238 input_pickled_object = sys.stdin.read()
240 if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
241 scraper_in_data = pickle.loads(input_pickled_object)
242 except Exception
as err:
243 ExceptionLog.handler(self.logger, err,
'pickle.loads() error:')
244 self.logger.debug(
"input_pickled_object:\n" + str(input_pickled_object))
245 self.exitCode = EXIT_FAILURE
249 if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
250 self.input_data = scraper_in_data
251 if self.input_data.batch_item.urlObj
is not None:
252 urlString = self.input_data.batch_item.urlObj.url
255 logMsg =
"BatchItem.siteId=" + str(self.input_data.batch_item.siteId) + \
256 ", BatchItem.urlId=" + str(self.input_data.batch_item.urlId) + \
257 ", BatchItem.urlObj.url=" + urlString
258 app.Profiler.messagesList.append(logMsg)
259 self.logger.info(
"Incoming data: %s", logMsg)
264 if self.input_data.output_format
is not None and "name" in self.input_data.output_format:
265 self.outputFormat = self.input_data.output_format[
"name"]
267 if self.outputFormat
is None and "templates" in self.input_data.batch_item.properties[
"template"]
and \
268 len(self.input_data.batch_item.properties[
"template"][
"templates"]) > 0
and \
269 "output_format" in self.input_data.batch_item.properties[
"template"][
"templates"][0]
and \
270 "name" in self.input_data.batch_item.properties[
"template"][
"templates"][0][
"output_format"]:
271 self.outputFormat = self.input_data.batch_item.properties[
"template"][
"templates"][0][
"output_format"][
"name"]
273 if "TAGS_MAPPING" in self.input_data.batch_item.properties
and \
274 self.input_data.batch_item.properties[
"TAGS_MAPPING"]
is not None:
276 self.altTagsMask = json.loads(self.input_data.batch_item.properties[
"TAGS_MAPPING"])
277 self.logger.debug(
">>> AltTags = " + str(self.altTagsMask))
278 except Exception
as exp:
279 self.logger.debug(
">>> Bad TAGS_MAPPING properties value, err=" + str(exp))
282 if (self.input_data
is not None)
and (self.input_data.processor_properties
is not None):
283 processor_properties = self.input_data.processor_properties
284 self.logger.debug(
"Processor's properties was taken from input data: %s" % processor_properties)
285 self.logger.debug(
"Processor's properties type: %s" % str(
type(processor_properties)))
286 if not isinstance(processor_properties, types.DictType):
287 processor_properties = json.loads(self.input_data.processor_properties)
288 self.logger.debug(
"Processor's properties was taken from input data: %s" % processor_properties)
289 self.properties.update(processor_properties)
290 except Exception
as err:
291 ExceptionLog.handler(self.logger, err,
'Error load properties from input data:')
293 self.algorithm_name = self.properties[CONSTS.ALGORITHM_KEY][CONSTS.ALGORITHM_NAME_KEY]
294 self.logger.debug(
"Algorithm : %s" % self.algorithm_name)
295 if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
296 Utils.storePickleOnDisk(input_pickled_object, ENV_SCRAPER_STORE_PATH,
"scraper.in." + \
297 str(self.input_data.urlId))
298 if "metrics" in self.properties:
300 self.metrics = json.loads(self.properties[
"metrics"])
301 self.logger.debug(
">>> Metrics loads = " + str(self.metrics))
302 except Exception
as excp:
303 self.logger.debug(
">>> Metrcis dumps exception = " + str(excp))
306 sys.stdout = open(
"/dev/null",
"wb")
310 self.loadExtractors()
317 scraperResponses = self.jsonParserProcess()
321 self.logger.debug(
"scraperResponse:\n%s",
varDump(scraperResponses))
322 if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
323 output_pickled_object = pickle.dumps(scraperResponses)
324 Utils.storePickleOnDisk(output_pickled_object, ENV_SCRAPER_STORE_PATH,
325 "scraper.out." + str(self.input_data.urlId))
326 print output_pickled_object
329 self.output_data = scraperResponses
330 except Exception
as err:
331 ExceptionLog.handler(self.logger, err,
'ScraperCustomJson process batch error:')
332 self.exitCode = EXIT_FAILURE
333 raise Exception(
'ScraperCustomJson process batch error:' + str(err))
# loadExtractors body (original lines 341-365, gapped).
# Looks up the module list for the current algorithm under properties[MODULES_KEY],
# instantiates each via createModule() (skipping failures, which return None),
# collects them into self.extractors, and logs the loaded set. Any unexpected error
# is routed through ExceptionLog.handler with MSG_ERROR_LOAD_EXTRACTORS.
# Note the repeated 'exrtactor' spelling — an existing identifier, left as-is.
341 if CONSTS.MODULES_KEY
in self.properties
and self.algorithm_name
in self.properties[CONSTS.MODULES_KEY]:
342 modules = self.properties[CONSTS.MODULES_KEY][self.algorithm_name]
344 self.logger.debug(
">>> No moduler_key or algorithm_name in self.properties")
347 self.logger.debug(
"Algorithm name: <%s>" % (self.algorithm_name))
348 self.logger.debug(
"Modules: %s" % modules)
351 for module
in modules:
352 exrtactor = self.createModule(module)
354 if exrtactor
is not None:
355 self.extractors.append(exrtactor)
358 self.logger.debug(
"*******************")
359 self.logger.debug(
"Loaded extractors:")
360 for exrtactor
in self.extractors:
361 self.logger.debug(exrtactor.name)
362 self.logger.debug(
"*******************")
364 except Exception
as err:
365 ExceptionLog.handler(self.logger, err, MSG_ERROR_LOAD_EXTRACTORS)
# createModule body (original lines 377-380, gapped around it).
# Instantiates an extractor class by NAME via eval(module_name)(...).
# SECURITY NOTE(review): eval on a configuration-supplied name executes arbitrary
# expressions if properties are ever attacker-controlled — a getattr/registry
# lookup would be safer; flagged only, since config is presumably trusted here.
# On failure the error is logged and (per the gapped surrounding code) the
# function presumably returns None so loadExtractors can skip the module.
377 appInst = (module_name, eval(module_name)(self.config,
None, self.urlHost, self.properties))[1]
378 self.logger.debug(
"%s has been created!" % module_name)
379 except Exception
as err:
380 ExceptionLog.handler(self.logger, err,
"Can't create module %s. Error is:" % (module_name))
# getNextBestExtractor body (original lines 388-389; the StopIteration handler's
# body is missing — presumably returns None when the iterator self.itr, built in
# jsonParserExtractor, is exhausted; callers treat a falsy result as "no more").
388 extractor = next(self.itr)
389 except StopIteration:
# resourceExtraction body (original lines 398-429, gapped).
# Builds a resource_set dict (url/resId/siteId plus the raw JSON element as
# "raw_html"), then iterates extractors from getNextBestExtractor(): each run
# produces a fresh Result, extractTags() fills it, and filled tags are merged
# into collectResult — a later extractor only supplies a tag if it is absent or
# still unfilled in collectResult (first-filled-wins via isTagFilled check).
# The combined result is prepended to ret (ret/resource construction sits in
# the missing lines — TODO confirm).
398 resource_set[
"url"] = self.input_data.url
399 resource_set[
"resId"] = self.input_data.urlId
400 resource_set[
"siteId"] = self.input_data.siteId
401 resource_set[
"raw_html"] = jsonElem
405 self.extractor = self.getNextBestExtractor()
406 self.logger.debug(
"get best matching extractor: " + str(self.extractor))
409 collectResult =
Result(self.config, self.input_data.urlId, self.metrics)
411 while self.extractor:
412 result =
Result(self.config, self.input_data.urlId, self.metrics)
413 self.logger.debug(
">>> TAG BEGIN extractor = " + str(self.extractor))
414 result = self.extractor.extractTags(resource, result)
416 self.logger.debug(
">>> TAG END")
417 empty_tags = result.getEmptyTags()
418 self.logger.debug(
"get list of empty tags from result: " + str(empty_tags))
419 filled_tags = result.getFilledTags()
420 self.logger.debug(
"get list of filled_tags from result: " + str(filled_tags))
421 self.extractor = self.getNextBestExtractor()
422 self.logger.debug(
"get best matching extractor: " + str(self.extractor))
424 for key
in result.tags:
425 if key
not in collectResult.tags
or not collectResult.isTagFilled(key):
426 collectResult.tags[key] = copy.deepcopy(result.tags[key])
428 self.logger.debug(
">>> EXIT LOOP")
429 ret = [collectResult] + ret
def formatOutpuElement(self, elem, localOutputFormat):
    """Format a single extracted tag value for the requested output format.

    Reconstructed from a mangled extraction (original lines ~434-449); the
    visible branch logic is preserved exactly.

    @param elem: the extracted value (string for html/xml/sql; any
                 JSON-serializable value for json)
    @param localOutputFormat: one of "json", "html", "xml", "sql"
    @return: the formatted value; unrecognized formats return elem unchanged
    """
    ret = elem
    if localOutputFormat == "json":
        # json.dumps wraps strings in quotes; strip one leading/trailing
        # quote so the value embeds cleanly in the outer JSON document.
        localStr = json.dumps(elem, ensure_ascii=False)
        if localStr[0] == '\"' or localStr[0] == '\'':
            localStr = localStr[1:]
        if localStr[-1] == '\"' or localStr[-1] == '\'':
            localStr = localStr[0:-1]
        ret = localStr
    elif localOutputFormat == "html" or localOutputFormat == "xml":
        # Escape &, <, > plus both quote characters as numeric/named entities.
        ret = xml.sax.saxutils.escape(elem, {"'": "&#39;", "\"": "&quot;"})
    elif localOutputFormat == "sql":
        # Project helper for SQL escaping (defined elsewhere in this project).
        ret = Utils.escape(elem)
    return ret
def formatOutputData(self, response, localOutputFormat):
    """Format every extracted tag value in *response* in place.

    Walks response.tags and, for each tag carrying a "data" entry, runs
    formatOutpuElement over it: element-wise for list values, directly for
    string values. Other value types are left untouched.
    """
    for tagName in response.tags:
        tagEntry = response.tags[tagName]
        if "data" not in tagEntry:
            continue
        payload = tagEntry["data"]
        if isinstance(payload, types.ListType):
            # Replace each element in place, preserving list identity.
            for idx, item in enumerate(payload):
                tagEntry["data"][idx] = self.formatOutpuElement(item, localOutputFormat)
        elif isinstance(payload, types.StringTypes):
            tagEntry["data"] = self.formatOutpuElement(payload, localOutputFormat)
# jsonParserExtractor body (original lines 468-543, gapped).
# Per JSON element: sort extractors into an iterator (key is constant 0, so the
# sort only reverses the stable order), run resourceExtraction, then for each
# response: precalculate metrics, strip, tag the source URL, optionally run
# language detection (LANG_PROP_NAME), normalize/transform pubdate (month order,
# SQL-expression evaluation, timezone) and write it back into the "pubdate" tag,
# apply output formatting when outputFormat is set, recalc tag mask/count with
# altTagsMask, stamp elapsed time, and apply HTTP-redirect-link handling before
# collecting processed content. Several call argument lists are truncated by the
# extraction gaps (e.g. evaluatePDateTime, pubdateTransform) — do not restructure
# without the full file. Typo "extracr" is runtime log text, left untouched.
468 if self.extractors
is not None:
469 self.itr = iter(sorted(self.extractors, key=
lambda extractor: 0, reverse=
True))
470 self.logger.debug(
"Extractors: %s" %
varDump(self.itr))
472 responses = self.resourceExtraction(jsonElem)
473 for response
in responses:
474 response.metricsPrecalculate()
475 response.stripResult()
477 self.addCustomTag(result=response, tag_name=CONSTS.TAG_SOURCE_URL,
478 tag_value=[str(self.input_data.url)])
480 if CONSTS.LANG_PROP_NAME
in self.properties:
483 langDetector.process(response, self.logger)
484 langTagsDict = langDetector.getLangTags()
485 self.logger.debug(
"langTagsDict: %s",
varDump(langTagsDict))
488 for tagName, langValue
in langTagsDict.items():
489 self.addCustomTag(result=response, tag_name=tagName, tag_value=langValue)
491 summaryLang = langDetector.getSummaryLang(response, self.logger)
492 self.addCustomTag(result=response, tag_name=CONSTS.TAG_SUMMARY_LANG, tag_value=summaryLang)
494 pubdate, timezone = self.normalizeDatetime(response, self.algorithm_name)
495 if pubdate
is not None:
496 self.pubdate = pubdate
497 self.logger.debug(
"Pubdate from 'pubdate': " + str(self.pubdate))
500 self.pubdate = self.pubdateMonthOrder(self.pubdate, self.input_data.batch_item.properties, self.input_data.url)
503 self.input_data.batch_item.urlObj.pDate = self.pubdate
504 self.pubdate = FieldsSQLExpressionEvaluator.evaluatePDateTime(self.input_data.batch_item.properties,
506 self.input_data.batch_item.urlObj,
511 self.pubdate, timezone = self.pubdateTransform(self.pubdate,
513 self.input_data.batch_item.properties,
517 self.addCustomTag(result=response, tag_name=CONSTS.TAG_PUBDATE_TZ, tag_value=[timezone])
519 if "pubdate" in response.tags
and "data" in response.tags[
"pubdate"]
and \
520 len(response.tags[
"pubdate"][
"data"]) > 0:
521 response.tags[
"pubdate"][
"data"][0] = self.pubdate
523 if self.outputFormat
is not None:
524 self.formatOutputData(response, self.outputFormat)
526 self.logger.debug(
">>> Warning, can't extracr output format")
527 response.recalcTagMaskCount(
None, self.altTagsMask)
528 self.tagsCount = response.tagsCount
529 self.tagsMask = response.tagsMask
531 self.logger.debug(
"self.tagsCount: %s", self.tagsCount)
532 self.logger.debug(
"self.tagsMasks: %s", self.tagsMask)
534 self.logger.debug(
">>> Resp: %s\n",
varDump(response))
537 response.finish = time.time()
538 response.data[
"time"] =
"%s" % (response.finish - response.start)
540 response = self.applyHTTPRedirectLink(self.input_data.batch_item.siteId, self.input_data.batch_item.urlObj.url,
541 self.input_data.batch_item.properties, response)
543 self.getProcessedContent(responses)
# getProcessedContent body (original lines 551-561; line 557 and any tail past
# 561 are missing). Packs results into the processedContent dict: "default" is
# the first result, "internal" the full list, "custom" empty; tag count/mask are
# taken from the first result. If the first result carries a non-empty "pubdate"
# data list, its first element becomes self.pubdate.
551 self.processedContent = {}
552 self.processedContent[
"default"] = result[0]
553 self.processedContent[
"internal"] = result
554 self.processedContent[
"custom"] = []
555 self.tagsCount = result[0].tagsCount
556 self.tagsMask = result[0].tagsMask
558 if "pubdate" in result[0].tags
and "data" in result[0].tags[
"pubdate"]
and \
559 len(result[0].tags[
"pubdate"][
"data"]) > 0:
560 self.pubdate = result[0].tags[
"pubdate"][
"data"][0]
561 self.logger.debug(
'>>>> Set self.pubdate = ' + str(self.pubdate))
# fillScraperResponse body (original lines 570-573; def line and any preceding
# lines are missing). Resets per-element state, runs jsonParserExtractor on the
# element (which repopulates processedContent / tagsCount / tagsMask / pubdate
# as side effects), then wraps the state into a ScraperResponse.
570 self.processedContent =
None 571 self.errorMask = APP_CONSTS.ERROR_OK
572 self.jsonParserExtractor(jsonElem)
573 return ScraperResponse(self.tagsCount, self.tagsMask, self.pubdate, self.processedContent, self.errorMask)
# generateEmptyResponse body (original lines 579-583, gap at 580).
# Builds a minimal Result carrying only the source-URL tag, routes it through
# getProcessedContent, and returns a ScraperResponse with zero tag count/mask
# and the ERROR_MASK_SCRAPER_ERROR error mask — used when input JSON is unusable.
579 localResult =
Result(self.config, self.input_data.urlId, self.metrics)
581 self.addCustomTag(result=localResult, tag_name=CONSTS.TAG_SOURCE_URL, tag_value=[str(self.input_data.url)])
582 self.getProcessedContent([localResult])
583 return ScraperResponse(0, 0, self.pubdate, self.processedContent, APP_CONSTS.ERROR_MASK_SCRAPER_ERROR)
# jsonParserProcess body (original lines 592-612, gapped; `ret` initialization
# not visible). Parses input_data.raw_content as JSON (parse failures are only
# logged, leaving rawDataJson presumably None — TODO confirm the missing init).
# When the payload is a list, each element (or each element of a nested list)
# is converted via fillScraperResponse; any non-list payload yields a single
# generateEmptyResponse. Returns the accumulated response list (return line is
# outside this extraction).
592 rawDataJson = json.loads(self.input_data.raw_content)
593 except Exception
as excp:
594 self.logger.debug(
">>> jsonParserProcess wrong rawData json: " + str(excp))
596 self.logger.debug(
"!!! type(rawDataJson) = %s", str(
type(rawDataJson)))
597 if not isinstance(rawDataJson, list):
598 self.logger.debug(
"!!! rawDataJson: %s",
varDump(rawDataJson))
601 if rawDataJson
is not None and isinstance(rawDataJson, list):
602 for elem
in rawDataJson:
603 if isinstance(elem, list):
604 for internalElem
in elem:
605 ret.append(self.fillScraperResponse(internalElem))
607 ret.append(self.fillScraperResponse(elem))
609 self.logger.debug(
">>> rawDataJson structure not List type")
612 ret.append(self.generateEmptyResponse())
def formatOutpuElement(self, elem, localOutputFormat)
def formatOutputData(self, response, localOutputFormat)
def jsonParserProcess(self)
def __init__(self, usageModel=APP_CONSTS.APP_USAGE_MODEL_PROCESS, configFile=None, logger=None, inputData=None)
def loadLogConfigFile(self)
def jsonParserExtractor(self, jsonElem)
def loadScraperProperties(self)
def getProcessedContent(self, result)
def getNextBestExtractor(self)
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
def generateEmptyResponse(self)
def fillScraperResponse(self, jsonElem)
def createModule(self, module_name)
def resourceExtraction(self, jsonElem)