HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.
2.0.0-chaika
Hierarchical Cluster Engine Python language binding
|
Additional Inherited Members | |
Public Member Functions inherited from dc_processor.Scraper.Scraper | |
def | __init__ (self, usageModel=APP_CONSTS.APP_USAGE_MODEL_PROCESS, configFile=None, logger=None, inputData=None) |
def | setup (self) |
def | run (self) |
def | checkDOMElement (self, elem) |
def | adjustPartialReferences (self, response) |
def | adjustTitle (self, response) |
def | adjustLinkURL (self, response) |
def | normalizeAuthor (self, confProp, procProp, response) |
def | normalizeDatetime (self, response, algorithmName) |
def | extractPubDate (self, response, dataTagName) |
def | pubdateTransform (self, rawPubdate, rawTimezone, properties, urlString) |
def | refineBadDateTags (self, response) |
def | calcUrlDomainCrc (self, url) |
def | process (self, config) |
def | applyPubdate (self, response, pubdate) |
def | preparseResponse (self, response) |
def | formatOutpuElement (self, elem, localOutputFormat) |
def | formatOutputData (self, response, localOutputFormat) |
def | getTemplate (self, explicit=True) |
def | postprocessing (self, result, rule, tag) |
def | templateExtraction (self, config, urlHost) |
def | addCustomTag (self, result, tag_name, tag_value) |
def | compileResults (self, result, resultsList, key, xPathPreparing=None) |
def | prepareResults (self, resultsList) |
def | elemUrlsCanoizator (self, data, baseUrl=None, firstDelim=' ', secondDelim=',', useAdditionEncoding=False) |
def | dataUrlsCanonizator (self, data, baseUrl=None, useAdditionEncoding=False) |
def | formatTag (self, result, path, key, pathDict, isExtract) |
def | applyPostProcessing (self, result, key, postProcessingRE) |
def | processingHTMLData (self, htmlBuf, bufFormat) |
def | getBestDatatimeData (self, data) |
def | newsExtraction (self) |
def | commonResultOperations (self, result) |
def | replaceLoopValue (self, buf, replaceFrom, replaceTo) |
def | refineCommonText (self, tagName, result) |
def | extractAdditionTagsByScrapy (self, localResult, key, tagsXpaths) |
def | getNextBestExtractor (self) |
def | getProcessedContent (self, result) |
def | loadExtractors (self) |
def | processBatch (self) |
def | loadConfig (self) |
def | loadLogConfigFile (self) |
def | loadOptions (self) |
def | loadScraperProperties (self) |
def | createModule (self, module_name) |
def | getExtractorByName (self, extractorName) |
def | getExitCode (self) |
def | feedParserProcess (self) |
def | createArticle (self) |
def | parseFeed (self) |
def | extractPubdateRssFeed (self, siteId, url) |
def | extractFeedUrlRssFeed (self, siteId, url) |
def | extractBaseUrlRssFeed (self, siteId, url) |
def | getHeaderContent (self, siteId, url) |
def | getVariableFromHeaderContent (self, headerContent, name, makeDecode=True) |
def | pubdateMonthOrder (self, rawPubdate, properties, urlString) |
def | checkMediaTag (self, urlStringMedia) |
def | splitMediaTagString (self, urlStringMedia) |
def | applyHTTPRedirectLink (self, siteId, url, properties, response) |
def | getDomainsForUrlSourcesRules (self, urlSourcesRules) |
Public Attributes inherited from dc_processor.Scraper.Scraper | |
exitCode | |
itr | |
extractor | |
extractors | |
input_data | |
logger | |
sqliteTimeout | |
scraperPropFileName | |
properties | |
algorithm_name | |
pubdate | |
response.tagsLangDetecting(self.properties[CONSTS.LANG_PROP_NAME]) More... | |
message_queue | |
entry | |
article | |
outputFormat | |
errorMask | |
metrics | |
altTagsMask | |
tagsCount | |
tagsMask | |
processedContent | |
usageModel | |
configFile | |
output_data | |
urlHost | |
xpathSplitString | |
useCurrentYear | |
datetimeNewsNames | |
datetimeTemplateTypes | |
tagsTypes | |
attrConditions | |
dbWrapper | |
mediaLimitsHandler | |
urlSourcesRules | |
tagReduceMask | |
baseUrl | |
config | |
Static Public Attributes inherited from dc_processor.Scraper.Scraper | |
string | MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong" |
string | WWW_PREFIX = "www." |
Definition at line 387 of file ScraperMultiItemsTask.py.