236 if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
238 input_pickled_object = sys.stdin.read()
240 if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
241 scraper_in_data = pickle.loads(input_pickled_object)
242 except Exception
as err:
243 ExceptionLog.handler(self.logger, err,
'pickle.loads() error:')
244 self.logger.debug(
"input_pickled_object:\n" + str(input_pickled_object))
245 self.exitCode = EXIT_FAILURE
249 if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
250 self.input_data = scraper_in_data
251 if self.input_data.batch_item.urlObj
is not None:
252 urlString = self.input_data.batch_item.urlObj.url
255 logMsg =
"BatchItem.siteId=" + str(self.input_data.batch_item.siteId) + \
256 ", BatchItem.urlId=" + str(self.input_data.batch_item.urlId) + \
257 ", BatchItem.urlObj.url=" + urlString
258 app.Profiler.messagesList.append(logMsg)
259 self.logger.info(
"Incoming data: %s", logMsg)
264 if self.input_data.output_format
is not None and "name" in self.input_data.output_format:
265 self.outputFormat = self.input_data.output_format[
"name"]
267 if self.outputFormat
is None and "templates" in self.input_data.batch_item.properties[
"template"]
and \
268 len(self.input_data.batch_item.properties[
"template"][
"templates"]) > 0
and \
269 "output_format" in self.input_data.batch_item.properties[
"template"][
"templates"][0]
and \
270 "name" in self.input_data.batch_item.properties[
"template"][
"templates"][0][
"output_format"]:
271 self.outputFormat = self.input_data.batch_item.properties[
"template"][
"templates"][0][
"output_format"][
"name"]
273 if "TAGS_MAPPING" in self.input_data.batch_item.properties
and \
274 self.input_data.batch_item.properties[
"TAGS_MAPPING"]
is not None:
276 self.altTagsMask = json.loads(self.input_data.batch_item.properties[
"TAGS_MAPPING"])
277 self.logger.debug(
">>> AltTags = " + str(self.altTagsMask))
278 except Exception
as exp:
279 self.logger.debug(
">>> Bad TAGS_MAPPING properties value, err=" + str(exp))
282 if (self.input_data
is not None)
and (self.input_data.processor_properties
is not None):
283 processor_properties = self.input_data.processor_properties
284 self.logger.debug(
"Processor's properties was taken from input data: %s" % processor_properties)
285 self.logger.debug(
"Processor's properties type: %s" % str(
type(processor_properties)))
286 if not isinstance(processor_properties, types.DictType):
287 processor_properties = json.loads(self.input_data.processor_properties)
288 self.logger.debug(
"Processor's properties was taken from input data: %s" % processor_properties)
289 self.properties.update(processor_properties)
290 except Exception
as err:
291 ExceptionLog.handler(self.logger, err,
'Error load properties from input data:')
293 self.algorithm_name = self.properties[CONSTS.ALGORITHM_KEY][CONSTS.ALGORITHM_NAME_KEY]
294 self.logger.debug(
"Algorithm : %s" % self.algorithm_name)
295 if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
296 Utils.storePickleOnDisk(input_pickled_object, ENV_SCRAPER_STORE_PATH,
"scraper.in." + \
297 str(self.input_data.urlId))
298 if "metrics" in self.properties:
300 self.metrics = json.loads(self.properties[
"metrics"])
301 self.logger.debug(
">>> Metrics loads = " + str(self.metrics))
302 except Exception
as excp:
303 self.logger.debug(
">>> Metrcis dumps exception = " + str(excp))
306 sys.stdout = open(
"/dev/null",
"wb")
310 self.loadExtractors()
317 scraperResponses = self.jsonParserProcess()
321 self.logger.debug(
"scraperResponse:\n%s",
varDump(scraperResponses))
322 if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
323 output_pickled_object = pickle.dumps(scraperResponses)
324 Utils.storePickleOnDisk(output_pickled_object, ENV_SCRAPER_STORE_PATH,
325 "scraper.out." + str(self.input_data.urlId))
326 print output_pickled_object
329 self.output_data = scraperResponses
330 except Exception
as err:
331 ExceptionLog.handler(self.logger, err,
'ScraperCustomJson process batch error:')
332 self.exitCode = EXIT_FAILURE
333 raise Exception(
'ScraperCustomJson process batch error:' + str(err))
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)