ResponseExtractor class: the main functionality of the application; the class inherits from foundation.CementApp.
More...
|
def | __init__ (self) |
| constructor More...
|
|
def | setup (self) |
| setup application More...
|
|
def | initLogger (self, configName) |
| load log config file More...
|
|
def | readFile (self, templateFile) |
| Read file. More...
|
|
def | writeFile (self, fileName, outBuffer) |
| Write file. More...
|
|
def | run (self) |
| run application More...
|
|
def | getNewsItem (self, contentObj, tags=None) |
| Make output content by substitution of the template's parts. More...
|
|
def | getDefaultItem (self, contentObj, tags=None) |
| Make output content by substitution of the template's parts. More...
|
|
def | getTagValueByName (self, tagName, item, responseFormat) |
| Check whether a tag is present in the response item by name and get its value. More...
|
|
def | parse (self, inputObject, inputFormat, maxItems=-1, startFrom=0, tags=None) |
| Parse the input json and make output collection. More...
|
|
def | logResultedStatistics (self, inputObject, items) |
| Logs the resulting statistics: the number of items detected and the number output. More...
|
|
def | detectFormat (self, contentObj) |
| Detects the format of response object. More...
|
|
def | jsonLoadsSafe (self, jsonString) |
| Parse json and return dict if okay or None if not. More...
|
|
def | process (self, inputBuffer, inputFormat=FROMAT_AUTO, maxItems=-1, startFrom=0, tags=None) |
| process main operations More...
|
|
ResponseExtractor class: the main functionality of the application; the class inherits from foundation.CementApp.
Definition at line 33 of file ResponseExtractor.py.
◆ __init__()
def app.ResponseExtractor.ResponseExtractor.__init__ |
( |
|
self | ) |
|
constructor
Definition at line 66 of file ResponseExtractor.py.
68 foundation.CementApp.__init__(self)
72 self.exitCode = APP_CONSTS.EXIT_SUCCESS
73 self.initTagsUniqueHashConfig = ''
74 self.initTagsLimitsConfig = ''
75 self.extendedLog = False
76 self.results = self.RESULTS_FORMAT_JSON
77 self.itemDelimiter = "\n"
def __init__(self)
constructor
◆ __loadAppConfig()
def app.ResponseExtractor.ResponseExtractor.__loadAppConfig |
( |
|
self, |
|
|
|
configName |
|
) |
| |
|
private |
load application config file
- Parameters
-
configName | - name of application config file |
- Returns
- - log config file name
Definition at line 108 of file ResponseExtractor.py.
113 config = ConfigParser.ConfigParser()
114 config.optionxform = str
116 readOk = config.read(configName)
119 raise Exception(self.MSG_ERROR_WRONG_CONFIG_FILE_NAME +
": " + configName)
121 if config.has_section(APP_CONSTS.CONFIG_APPLICATION_SECTION_NAME):
122 confLogFileName = str(config.get(APP_CONSTS.CONFIG_APPLICATION_SECTION_NAME, self.CONFIG_OPTION_LOG))
124 self.itemDelimiter = str(config.get(APP_CONSTS.CONFIG_APPLICATION_SECTION_NAME,
125 self.CONFIG_OPTION_ITEM_DELIMITER))
126 except Exception, err:
127 raise Exception(self.MSG_ERROR_LOAD_APP_CONFIG +
' ' + str(err))
129 return confLogFileName
def __loadAppConfig(self, configName)
◆ detectFormat()
def app.ResponseExtractor.ResponseExtractor.detectFormat |
( |
|
self, |
|
|
|
contentObj |
|
) |
| |
Detects the format of response object.
- Parameters
-
contentObj | response object |
- Returns
- format code
Definition at line 379 of file ResponseExtractor.py.
379 def detectFormat(self, contentObj):
382 if isinstance(contentObj, dict):
384 inputFormat = self.FROMAT_INTERNAL
385 elif isinstance(contentObj, list)
and len(contentObj) > 0:
387 inputFormat = self.FROMAT_NEWS
◆ getDefaultItem()
def app.ResponseExtractor.ResponseExtractor.getDefaultItem |
( |
|
self, |
|
|
|
contentObj, |
|
|
|
tags = None |
|
) |
| |
Make output content by substitution of the template's parts.
- Parameters
-
contentObj | the object from response item in the default internal format |
tags | - list of the tags names |
- Returns
- the dict of tags and values
Definition at line 253 of file ResponseExtractor.py.
253 def getDefaultItem(self, contentObj, tags=None):
256 if 'default' in contentObj
and 'data' in contentObj[
'default']
and 'tagList' in contentObj[
'default'][
'data']
and\
257 isinstance(contentObj[
'default'][
'data'][
'tagList'], list)
and len(contentObj[
'default'][
'data'][
'tagList']) > 0:
258 contentObj = contentObj[
'default'][
'data'][
'tagList'][0]
260 raise Exception(
'Wrong format of the contentObj, structure checks not passed!')
263 self.logger.debug(
"Internal format item processing:\n%s", str(contentObj))
266 for tagItem
in contentObj:
267 tagName = tagItem[
'name']
269 tagValue = self.getTagValueByName(tagName, contentObj, self.FROMAT_INTERNAL)
270 ret[tagName] = tagValue
◆ getNewsItem()
def app.ResponseExtractor.ResponseExtractor.getNewsItem |
( |
|
self, |
|
|
|
contentObj, |
|
|
|
tags = None |
|
) |
| |
Make output content by substitution of the template's parts.
- Parameters
-
contentObj | the object from response item in News format |
tags | - list of the tags names |
- Returns
- the dict of tags and values
Definition at line 231 of file ResponseExtractor.py.
231 def getNewsItem(self, contentObj, tags=None):
235 self.logger.debug(
"News format item processing:\n%s", str(contentObj))
241 for tagName
in contentObj:
243 ret[tagName] = self.getTagValueByName(tagName, contentObj, self.FROMAT_NEWS)
◆ getTagValueByName()
def app.ResponseExtractor.ResponseExtractor.getTagValueByName |
( |
|
self, |
|
|
|
tagName, |
|
|
|
item, |
|
|
|
responseFormat |
|
) |
| |
Check whether a tag is present in the response item by name and get its value.
- Parameters
-
tagName | the name of the tag |
item | - the one tags set item of the itemObject in the scraper response |
responseFormat | - format of the scraper response |
- Returns
- the value of the tag; an empty value is assumed if the tag is not found
Definition at line 281 of file ResponseExtractor.py.
281 def getTagValueByName(self, tagName, item, responseFormat):
284 if responseFormat == self.FROMAT_NEWS:
286 ret = item[tagName].
decode(
'string_escape')
289 self.logger.debug(
"Tag `%s` not found as News format, empty value assumed", tagName)
290 elif responseFormat == self.FROMAT_INTERNAL:
293 if tag[
'name'] == tagName:
295 if len(tag[
'data']) > 0:
296 ret = tag[
'data'][0].
decode(
'string_escape')
298 if not found
and self.extendedLog:
299 self.logger.debug(
"Tag `%s` not found as internal format, empty value assumed", tagName)
302 self.logger.debug(
"Format %s not supported", str(responseFormat))
◆ initLogger()
def app.ResponseExtractor.ResponseExtractor.initLogger |
( |
|
self, |
|
|
|
configName |
|
) |
| |
load log config file
- Parameters
-
configName | - name of log rtc-finalizer config file |
- Returns
- - None
Definition at line 136 of file ResponseExtractor.py.
136 def initLogger(self, configName):
138 if isinstance(configName, str)
and len(configName) == 0:
139 raise Exception(self.MSG_ERROR_EMPTY_CONFIG_FILE_NAME)
141 logging.config.fileConfig(configName)
144 self.logger = Utils.MPLogger().
getLogger()
146 except Exception, err:
147 raise Exception(self.MSG_ERROR_READ_LOG_CONFIG +
' ' + str(err))
◆ jsonLoadsSafe()
def app.ResponseExtractor.ResponseExtractor.jsonLoadsSafe |
( |
|
self, |
|
|
|
jsonString |
|
) |
| |
Parse json and return dict if okay or None if not.
- Parameters
-
jsonString | - the json string to parse |
- Returns
- the resulting dict, or None if the json could not be parsed
Definition at line 399 of file ResponseExtractor.py.
403 if jsonString
is not None:
404 ret = json.loads(jsonString)
405 except Exception, err:
406 self.logger.
error(
"Error pars json: %s\n%s", str(err), jsonString)
def jsonLoadsSafe(jsonString, default=None, log=None)
◆ logResultedStatistics()
def app.ResponseExtractor.ResponseExtractor.logResultedStatistics |
( |
|
self, |
|
|
|
inputObject, |
|
|
|
items |
|
) |
| |
Logs the resulting statistics: the number of items detected and the number output.
- Parameters
-
inputObject | - the parsed input object containing the "itemsList" collection |
items | - number of items in the results set |
Definition at line 371 of file ResponseExtractor.py.
371 def logResultedStatistics(self, inputObject, items):
372 self.logger.debug(
"Items detected %s, output: %s", str(len(inputObject[
"itemsList"][0][
"itemObject"])), str(items))
◆ parse()
def app.ResponseExtractor.ResponseExtractor.parse |
( |
|
self, |
|
|
|
inputObject, |
|
|
|
inputFormat, |
|
|
|
maxItems = -1 , |
|
|
|
startFrom = 0 , |
|
|
|
tags = None |
|
) |
| |
Parse the input json and make output collection.
- Parameters
-
inputObject | - the parsed input json object |
inputFormat | - format of the inputObject |
maxItems | - max items to process |
startFrom | - processing start from item |
tags | - list of tags names to get |
- Returns
- the output collection of items extracted from the input object
Definition at line 315 of file ResponseExtractor.py.
315 def parse(self, inputObject, inputFormat, maxItems=-1, startFrom=0, tags=None):
322 for item
in inputObject[
"itemsList"][0][
"itemObject"]:
323 if maxItems > -1
and i == maxItems:
326 if startFrom > 0
and s < startFrom:
330 if len(item[
"processedContents"]) > 0:
332 contentObj = json.loads(base64.b64decode(item[
"processedContents"][0][
"buffer"]))
333 except Exception, err:
334 self.logger.
error(
"Error get contentObj or cDate: %s, possible wrong json in buffer:\n%s", str(err),
335 str(item[
"processedContents"][0][
"buffer"]))
339 inputFormatLocal = self.detectFormat(contentObj)
340 if inputFormatLocal
is None:
341 self.logger.info(
"Unsupported item object format or empty list:\n%s", str(contentObj))
344 if inputFormatLocal == self.FROMAT_INTERNAL:
346 item = self.getDefaultItem(contentObj, tags)
347 elif inputFormatLocal == self.FROMAT_NEWS:
349 item = self.getNewsItem(contentObj[0], tags)
350 elif inputFormatLocal == self.FROMAT_RSS_FEED:
355 except (KeyboardInterrupt, SystemExit):
357 except Exception, err:
358 self.logger.
error(
"Error process item: %s, contentObj:\n%s", str(err), str(contentObj))
359 self.logger.debug(
"%s", Utils.getTracebackInfo())
362 self.logResultedStatistics(inputObject, i)
◆ process()
def app.ResponseExtractor.ResponseExtractor.process |
( |
|
self, |
|
|
|
inputBuffer, |
|
|
|
inputFormat = FROMAT_AUTO , |
|
|
|
maxItems = -1 , |
|
|
|
startFrom = 0 , |
|
|
|
tags = None |
|
) |
| |
process main operations
- Parameters
-
inputBuffer | - the input buffer, expected to be the json from a DCC URL_CONTENT response |
inputFormat | - format of the input buffer, including News, Template, RSS-Feed and so on |
maxItems | - max items |
startFrom | - start from item |
tags | - list of tag names |
- Returns
- formatted string buffer
Definition at line 419 of file ResponseExtractor.py.
419 def process(self, inputBuffer, inputFormat=FROMAT_AUTO, maxItems=-1, startFrom=0, tags=None):
423 self.logger.debug(
"Processing started, tags: %s", str(tags))
425 inputObject = json.loads(inputBuffer)
427 items = self.parse(inputObject, inputFormat, maxItems, startFrom, tags)
430 if self.results == self.RESULTS_FORMAT_JSON:
431 ret = json.dumps(items, indent=2, ensure_ascii=
False)
432 elif self.results == self.RESULTS_FORMAT_CSV_LINE:
436 buf += tagName +
'=' + item[tagName] +
',' 437 if buf[:-1].strip() !=
'':
438 ret += buf[:-1] + self.itemDelimiter.replace(
"\\n",
"\n")
439 elif self.results == self.RESULTS_FORMAT_FIELD_LINE:
443 buf += tagName +
'=' + item[tagName] +
"\n" 444 if buf[:-1].strip() !=
'':
445 ret += buf[:-1] + self.itemDelimiter.replace(
"\\n",
"\n")
449 except Exception, err:
450 ExceptionLog.handler(self.logger, err,
"Error:")
451 raise Exception(self.MSG_ERROR_PROCESSING_REQUEST +
' ' + str(err))
◆ readFile()
def app.ResponseExtractor.ResponseExtractor.readFile |
( |
|
self, |
|
|
|
templateFile |
|
) |
| |
Read file.
- Parameters
-
templateFile | - name of file to read |
- Returns
- - the buffer
Definition at line 154 of file ResponseExtractor.py.
155 with open(templateFile,
'r') as f:
◆ run()
def app.ResponseExtractor.ResponseExtractor.run |
( |
|
self | ) |
|
run application
Definition at line 171 of file ResponseExtractor.py.
173 foundation.CementApp.run(self)
175 startTime = time.time()
177 if self.pargs.config:
178 self.initLogger(self.__loadAppConfig(self.pargs.config))
180 raise Exception(self.MSG_ERROR_LOAD_APP_CONFIG)
183 inputBuffer = self.readFile(self.pargs.input)
185 inputBuffer = sys.stdin.read()
187 if self.pargs.format:
188 inputFormat = int(self.pargs.format)
192 if self.pargs.maxitems:
193 maxItems = int(self.pargs.maxitems)
197 startFrom = int(self.pargs.start)
201 self.extendedLog = bool(int(self.pargs.extended))
204 tags = self.pargs.tags.split(
',')
208 if self.pargs.results:
209 self.results = int(self.pargs.results)
212 outputBuffer = self.process(inputBuffer, inputFormat, maxItems, startFrom, tags)
214 self.logger.info(
"Total time: %s", str(time.time() - startTime))
216 if self.pargs.output:
217 self.writeFile(self.pargs.output, outputBuffer)
223 self.logger.info(APP_CONSTS.LOGGER_DELIMITER_LINE)
◆ setup()
def app.ResponseExtractor.ResponseExtractor.setup |
( |
|
self | ) |
|
setup application
Definition at line 80 of file ResponseExtractor.py.
82 foundation.CementApp.setup(self)
83 self.args.add_argument(
'-c',
'--config', action=
'store', metavar=
'config_file', help=
'config ini-file')
84 self.args.add_argument(
'-i',
'--input', action=
'store', metavar=
'input_json_file',
85 help=
'input json file of the URL_CONTENT response, if omitted the stdin read used')
86 self.args.add_argument(
'-o',
'--output', action=
'store', metavar=
'output_file, if omitted the stdout print used',
87 help=
'input file, if omitted the stdout write used')
88 self.args.add_argument(
'-f',
'--format', action=
'store', metavar=
'input_json_file_format',
89 help=
'input json file buffer format: -1 - auto (default if omitted) 0 - internal,' + \
90 ' 1 - news, 2 - rss-feed')
91 self.args.add_argument(
'-m',
'--maxitems', action=
'store', metavar=
'max_items',
92 help=
'max items number to read')
93 self.args.add_argument(
'-s',
'--start', action=
'store', metavar=
'start_from',
94 help=
'start from item')
95 self.args.add_argument(
'-e',
'--extended', action=
'store', metavar=
'extended',
96 help=
'extended log with additional debug information')
97 self.args.add_argument(
'-t',
'--tags', action=
'store', metavar=
'tags',
98 help=
'csv tags fields names list, all fields from response if omitted')
99 self.args.add_argument(
'-r',
'--results', action=
'store', metavar=
'results',
100 help=
'results format: 0 - csv fields names list one item per line, ' + \
101 '1 - fields list one field per line, 2 - json (default if omitted)')
◆ writeFile()
def app.ResponseExtractor.ResponseExtractor.writeFile |
( |
|
self, |
|
|
|
fileName, |
|
|
|
outBuffer |
|
) |
| |
Write file.
- Parameters
-
fileName | - name of file to write |
outBuffer | - buffer to write |
Definition at line 165 of file ResponseExtractor.py.
165 def writeFile(self, fileName, outBuffer):
166 with open(fileName,
'w')
as f:
◆ batch
app.ResponseExtractor.ResponseExtractor.batch |
◆ CONFIG_OPTION_ITEM_DELIMITER
string app.ResponseExtractor.ResponseExtractor.CONFIG_OPTION_ITEM_DELIMITER = "itemDelimiter" |
|
static |
◆ CONFIG_OPTION_LOG
string app.ResponseExtractor.ResponseExtractor.CONFIG_OPTION_LOG = 'log' |
|
static |
◆ exitCode
app.ResponseExtractor.ResponseExtractor.exitCode |
◆ extendedLog
app.ResponseExtractor.ResponseExtractor.extendedLog |
◆ FROMAT_AUTO
int app.ResponseExtractor.ResponseExtractor.FROMAT_AUTO = -1 |
|
static |
◆ FROMAT_INTERNAL
int app.ResponseExtractor.ResponseExtractor.FROMAT_INTERNAL = 0 |
|
static |
◆ FROMAT_NEWS
int app.ResponseExtractor.ResponseExtractor.FROMAT_NEWS = 1 |
|
static |
◆ FROMAT_RSS_FEED
int app.ResponseExtractor.ResponseExtractor.FROMAT_RSS_FEED = 2 |
|
static |
◆ initTagsLimitsConfig
app.ResponseExtractor.ResponseExtractor.initTagsLimitsConfig |
◆ initTagsUniqueHashConfig
app.ResponseExtractor.ResponseExtractor.initTagsUniqueHashConfig |
◆ itemDelimiter
app.ResponseExtractor.ResponseExtractor.itemDelimiter |
◆ logger
app.ResponseExtractor.ResponseExtractor.logger |
◆ MSG_ERROR_EMPTY_CONFIG_FILE_NAME
string app.ResponseExtractor.ResponseExtractor.MSG_ERROR_EMPTY_CONFIG_FILE_NAME = "Config file name is empty." |
|
static |
◆ MSG_ERROR_LOAD_APP_CONFIG
string app.ResponseExtractor.ResponseExtractor.MSG_ERROR_LOAD_APP_CONFIG = "Error loading application config file." |
|
static |
◆ MSG_ERROR_PARSE_CMD_PARAMS
string app.ResponseExtractor.ResponseExtractor.MSG_ERROR_PARSE_CMD_PARAMS = "Error parse command line parameters." |
|
static |
◆ MSG_ERROR_PROCESSING_REQUEST
string app.ResponseExtractor.ResponseExtractor.MSG_ERROR_PROCESSING_REQUEST = "Error processing input data." |
|
static |
◆ MSG_ERROR_READ_LOG_CONFIG
string app.ResponseExtractor.ResponseExtractor.MSG_ERROR_READ_LOG_CONFIG = "Error read log config file." |
|
static |
◆ MSG_ERROR_WRONG_CONFIG_FILE_NAME
string app.ResponseExtractor.ResponseExtractor.MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong" |
|
static |
◆ results
app.ResponseExtractor.ResponseExtractor.results |
◆ RESULTS_FORMAT_CSV_LINE
int app.ResponseExtractor.ResponseExtractor.RESULTS_FORMAT_CSV_LINE = 0 |
|
static |
◆ RESULTS_FORMAT_FIELD_LINE
int app.ResponseExtractor.ResponseExtractor.RESULTS_FORMAT_FIELD_LINE = 1 |
|
static |
◆ RESULTS_FORMAT_JSON
int app.ResponseExtractor.ResponseExtractor.RESULTS_FORMAT_JSON = 2 |
|
static |
The documentation for this class was generated from the following file: