HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_processor.Scraper.Scraper Class Reference
Inheritance diagram for dc_processor.Scraper.Scraper:
Collaboration diagram for dc_processor.Scraper.Scraper:

Classes

class  Meta
 

Public Member Functions

def __init__ (self, usageModel=APP_CONSTS.APP_USAGE_MODEL_PROCESS, configFile=None, logger=None, inputData=None)
 
def setup (self)
 
def run (self)
 
def checkDOMElement (self, elem)
 
def adjustPartialReferences (self, response)
 
def adjustTitle (self, response)
 
def adjustLinkURL (self, response)
 
def normalizeAuthor (self, confProp, procProp, response)
 
def normalizeDatetime (self, response, algorithmName)
 
def extractPubDate (self, response, dataTagName)
 
def pubdateTransform (self, rawPubdate, rawTimezone, properties, urlString)
 
def refineBadDateTags (self, response)
 
def calcUrlDomainCrc (self, url)
 
def process (self, config)
 
def applyPubdate (self, response, pubdate)
 
def preparseResponse (self, response)
 
def formatOutpuElement (self, elem, localOutputFormat)
 
def formatOutputData (self, response, localOutputFormat)
 
def getTemplate (self, explicit=True)
 
def postprocessing (self, result, rule, tag)
 
def templateExtraction (self, config, urlHost)
 
def addCustomTag (self, result, tag_name, tag_value)
 
def compileResults (self, result, resultsList, key, xPathPreparing=None)
 
def prepareResults (self, resultsList)
 
def elemUrlsCanoizator (self, data, baseUrl=None, firstDelim=' ', secondDelim=',', useAdditionEncoding=False)
 
def dataUrlsCanonizator (self, data, baseUrl=None, useAdditionEncoding=False)
 
def formatTag (self, result, path, key, pathDict, isExtract)
 
def applyPostProcessing (self, result, key, postProcessingRE)
 
def processingHTMLData (self, htmlBuf, bufFormat)
 
def getBestDatatimeData (self, data)
 
def newsExtraction (self)
 
def commonResultOperations (self, result)
 
def replaceLoopValue (self, buf, replaceFrom, replaceTo)
 
def refineCommonText (self, tagName, result)
 
def extractAdditionTagsByScrapy (self, localResult, key, tagsXpaths)
 
def getNextBestExtractor (self)
 
def getProcessedContent (self, result)
 
def loadExtractors (self)
 
def processBatch (self)
 
def loadConfig (self)
 
def loadLogConfigFile (self)
 
def loadOptions (self)
 
def loadScraperProperties (self)
 
def createModule (self, module_name)
 
def getExtractorByName (self, extractorName)
 
def getExitCode (self)
 
def feedParserProcess (self)
 
def createArticle (self)
 
def parseFeed (self)
 
def extractPubdateRssFeed (self, siteId, url)
 
def extractFeedUrlRssFeed (self, siteId, url)
 
def extractBaseUrlRssFeed (self, siteId, url)
 
def getHeaderContent (self, siteId, url)
 
def getVariableFromHeaderContent (self, headerContent, name, makeDecode=True)
 
def pubdateMonthOrder (self, rawPubdate, properties, urlString)
 
def checkMediaTag (self, urlStringMedia)
 
def splitMediaTagString (self, urlStringMedia)
 
def applyHTTPRedirectLink (self, siteId, url, properties, response)
 
def getDomainsForUrlSourcesRules (self, urlSourcesRules)
 

Public Attributes

 exitCode
 
 itr
 
 extractor
 
 extractors
 
 input_data
 
 logger
 
 sqliteTimeout
 
 scraperPropFileName
 
 properties
 
 algorithm_name
 
 pubdate
 
 message_queue
 
 entry
 
 article
 
 outputFormat
 
 errorMask
 
 metrics
 
 altTagsMask
 
 tagsCount
 
 tagsMask
 
 processedContent
 
 usageModel
 
 configFile
 
 output_data
 
 urlHost
 
 xpathSplitString
 
 useCurrentYear
 
 datetimeNewsNames
 
 datetimeTemplateTypes
 
 tagsTypes
 
 attrConditions
 
 dbWrapper
 
 mediaLimitsHandler
 
 urlSourcesRules
 
 tagReduceMask
 
 baseUrl
 
 config
 

Static Public Attributes

string MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong"
 
string WWW_PREFIX = "www."
 

Detailed Description

Definition at line 106 of file Scraper.py.

Constructor & Destructor Documentation

◆ __init__()

def dc_processor.Scraper.Scraper.__init__ (   self,
  usageModel = APP_CONSTS.APP_USAGE_MODEL_PROCESS,
  configFile = None,
  logger = None,
  inputData = None 
)

Definition at line 121 of file Scraper.py.

121  def __init__(self, usageModel=APP_CONSTS.APP_USAGE_MODEL_PROCESS, configFile=None, logger=None, inputData=None):
122  if usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
123  # call base class __init__ method
124  foundation.CementApp.__init__(self)
125 
126  self.exitCode = EXIT_SUCCESS
127  self.itr = None
128  self.extractor = None
129  self.extractors = []
130  self.input_data = inputData
131  self.logger = logger
132  self.sqliteTimeout = SQLITE_TIMEOUT
133  self.scraperPropFileName = None
134  self.properties = {}
135  self.algorithm_name = None
136  self.pubdate = None
137  self.message_queue = []
138  self.entry = None
139  self.article = None
140  self.outputFormat = None
141  self.errorMask = APP_CONSTS.ERROR_OK
142  self.metrics = None
143  self.altTagsMask = None
144  self.tagsCount = 0
145  self.tagsMask = 0
146  self.processedContent = None
147  self.usageModel = usageModel
148  self.configFile = configFile
149  self.output_data = None
150  self.urlHost = None
151  self.xpathSplitString = ' '
152  self.useCurrentYear = 0
153  self.datetimeNewsNames = []
154  self.datetimeTemplateTypes = []
155  self.tagsTypes = None
156  self.attrConditions = None
157  self.dbWrapper = None
158  self.mediaLimitsHandler = None
159  self.urlSourcesRules = None
160  self.tagReduceMask = DEFAULT_TAG_REDUCE_MASK
161  self.baseUrl = None
162 
163 

Member Function Documentation

◆ addCustomTag()

def dc_processor.Scraper.Scraper.addCustomTag (   self,
  result,
  tag_name,
  tag_value 
)

Definition at line 1006 of file Scraper.py.

1006  def addCustomTag(self, result, tag_name, tag_value):
1007  if tag_name not in result.tags:
1008  data = {"extractor": "Base extractor", "data": "", "name": ""}
1009  data["data"] = tag_value
1010  data["name"] = tag_name
1011  data["xpath"] = None
1012  data["type"] = None
1013  data["extractor"] = self.__class__.__name__
1014  result.tags[tag_name] = data
1015 
1016 
1017 # def compileResults(self, result, resultsList, key, xPathPreparing=None):
1018 # for elem in resultsList:
1019 # if key in result.tags:
1020 # if result.tags[key]["xpath"] is None:
1021 # result.tags[key]["xpath"] = elem["obj"].tags[key]["xpath"]
1022 # else:
1023 # result.tags[key]["xpath"] += ' '
1024 # result.tags[key]["xpath"] += elem["obj"].tags[key]["xpath"]
1025 # if result.tags[key]["data"] is None or len(result.tags[key]["data"]) == 0:
1026 # result.tags[key]["data"] = elem["obj"].tags[key]["data"]
1027 # else:
1028 # if xPathPreparing is not None:
1029 # self.xpathSplitString = xPathPreparing.resolveDelimiter(elem, self.properties, self.xpathSplitString)
1030 # result.tags[key]["data"][0] += self.xpathSplitString
1031 # result.tags[key]["data"][0] += elem["obj"].tags[key]["data"][0]
1032 # else:
1033 # result.tags.update(elem["obj"].tags)
1034 
Here is the caller graph for this function:
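Each tag stored by addCustomTag() is a plain dictionary keyed by the tag name. A minimal, self-contained sketch of the same record layout (SimpleResult and add_custom_tag are illustrative stand-ins, not part of the project API):

# Sketch only: mirrors the record layout built by addCustomTag() above.
class SimpleResult(object):
    def __init__(self):
        self.tags = {}

def add_custom_tag(result, tag_name, tag_value, extractor_name="Scraper"):
    # same keys as above: extractor / data / name / xpath / type
    if tag_name not in result.tags:
        result.tags[tag_name] = {"extractor": extractor_name,
                                 "data": tag_value,
                                 "name": tag_name,
                                 "xpath": None,
                                 "type": None}

result = SimpleResult()
add_custom_tag(result, "redirect_url", ["http://example.com/"])
print(result.tags["redirect_url"]["data"])  # ['http://example.com/']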

◆ adjustLinkURL()

def dc_processor.Scraper.Scraper.adjustLinkURL (   self,
  response 
)

Definition at line 329 of file Scraper.py.

329  def adjustLinkURL(self, response):
330  flag = False
331  try:
332  if response.tags and "link" in response.tags:
333  self.logger.debug("resource has template with link tag. Try to adjust link.")
334  self.logger.debug("response.tags['link']: " + str(response.tags["link"]))
335  self.logger.debug("self.extractor: %s", str(self.extractor))
336  flag = True
337  if self.extractor:
338  self.logger.debug("Extractor exists")
339  if isinstance(response.tags["link"], basestring):
340  self.logger.debug("response has not have link tag")
341  self.extractor.addTag(result=response, tag_name="link", tag_value=[self.input_data.url])
342  # bypass
343  else:
344  response.tags["link"]["data"] = self.input_data.url
345  else:
346  if len(self.extractors) > 2:
347  self.extractors[2].addTag(result=response, tag_name="link", tag_value=[self.input_data.url])
348  else:
349  self.logger.debug(">>> Wrong! self.extractors list doesn't have 3'th element (index 2)")
350  self.logger.debug("TYPE response.tags['link']['data']" + str(type(response.tags["link"]["data"])))
351  else:
352  self.logger.debug("resource hasn't template with link tag. Don't need adjust link.")
353  except Exception as err:
354  ExceptionLog.handler(self.logger, err, MSG_ERROR_ADJUST_PR, (), \
355  {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
356 
357  return flag
358 
359 
Here is the caller graph for this function:

◆ adjustPartialReferences()

def dc_processor.Scraper.Scraper.adjustPartialReferences (   self,
  response 
)

Definition at line 214 of file Scraper.py.

214  def adjustPartialReferences(self, response):
215  # self.logger.debug("!!! response.tags: " + varDump(response.tags))
216 # self.logger.debug("!!! self.input_data.template: " + varDump(self.input_data.template))
217 # self.logger.debug("self.input_data.url: %s", varDump(self.input_data.url))
218 # self.logger.debug("self.input_data.siteId: %s", varDump(self.input_data.siteId))
219 
220  if "link" in response.tags and isinstance(response.tags["link"], dict) and \
221  "media" in response.tags and isinstance(response.tags["media"], dict):
222  try:
223  url = None
224  if self.input_data.template and "link" in self.input_data.template:
225  self.logger.debug("url type: %s", str(type(response.tags["link"]["data"])))
226  if isinstance(response.tags["link"]["data"], basestring):
227  url = response.tags["link"]["data"]
228  else:
229  url = response.tags["link"]["data"][0]
230 
231  url = urlNormalization(self.baseUrl, url)
232  response.tags["link"]["data"] = url
233 
234  else:
235  url = self.input_data.url
236 
237 # self.logger.debug("link tag in response: '%s'", str(url))
238 # self.logger.debug("response.tags['media']: %s", str(response.tags["media"]))
239 # self.logger.debug("media tag in response: %s, type: %s" , str(response.tags["media"]["data"]), str(type(response.tags["media"]["data"])))
240  res = []
241  mediaData = []
242  if isinstance(response.tags["media"]["data"], basestring):
243  mediaData = [response.tags["media"]["data"]]
244  elif isinstance(response.tags["media"]["data"], list):
245  mediaData = list(set(response.tags["media"]["data"]))
246  else:
247  self.logger.error("!!! Wrong type of tag 'media': %s", str(type(response.tags["media"]["data"])))
248 
249  filter_patterns, filter_types = [], []
250  if self.input_data.filters:
251  # filter_types = [filter_item["Type"] for filter_item in self.input_data.filters]
252  # filter_patterns = [re.compile(filter_item["Pattern"]) for filter_item in self.input_data.filters]
253  filter_types = [filter_item.type for filter_item in self.input_data.filters]
254  filter_patterns = [re.compile(filter_item.pattern) for filter_item in self.input_data.filters]
255  # self.logger.debug("filter: %s", varDump(self.input_data.filters))
256 
257  for media in mediaData:
258  self.logger.debug("Media link: '%s'", media)
259  # instead pure url
260  if self.checkDOMElement(media):
261  res.append(media)
262  break
263 # media = urlparse.urljoin(url, media)
264  media = urlNormalization(self.baseUrl, media)
265 # self.logger.debug("media 2: %s", media)
266 
267  for filter_type, filter_pattern in zip(filter_types, filter_patterns):
268  match = filter_pattern.search(media)
269  if filter_type == SiteFilter.TYPE_EXCLUDE and match:
270  break
271  if filter_type == SiteFilter.TYPE_INCLUDE and match:
272  allowedUrls = self.checkMediaTag(media)
273  if len(allowedUrls) > 0:
274  res.append(','.join(allowedUrls))
275  break
276  else:
277  self.logger.debug("media: %s", media)
278  self.logger.debug("url: %s", url)
279  allowedUrls = self.checkMediaTag(media)
280  if len(allowedUrls) > 0:
281  res.append(','.join(allowedUrls))
282 
283  # If media tag after adjusting is empty remove it from response
284  if not len(res):
285  self.logger.debug("media tag is empty. Remove media tag from response.")
286  del response.tags["media"]
287  else:
288  self.logger.debug("media tag is adjusted. Copy media tag to response.")
289  response.tags["media"]["data"] = res
290  # End code block removing empty media tag
291 # else:
292 # self.logger.debug("resource hasn't template with media tag. adjustPartialReferences doesn't execute")
293  except Exception as err:
294  ExceptionLog.handler(self.logger, err, MSG_ERROR_ADJUST_PR, (), \
295  {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
296 
297  else:
298  self.logger.debug(">>> Response has not have link or media tag, Don't need adjust media")
299 
300 
Here is the call graph for this function:
Here is the caller graph for this function:

◆ adjustTitle()

def dc_processor.Scraper.Scraper.adjustTitle (   self,
  response 
)

Definition at line 303 of file Scraper.py.

303  def adjustTitle(self, response):
304  try:
305  if self.input_data.template and "title" in self.input_data.template and "title" in response.tags:
306  self.logger.debug("resource has template with title tag. Try to adjust title.")
307  self.logger.debug("response.tags['title']: " + str(response.tags["title"]))
308  localExtractor = self.extractor
309  if localExtractor is None:
310  if len(self.extractors) > 2:
311  localExtractor = self.extractors[2]
312  else:
313  raise Exception(">>> Wrong! self.extractors list doesn't have 3'th element (index 2)")
314  if isinstance(response.tags["title"], basestring):
315  self.logger.debug("response has not have title tag")
316  sel = SelectorWrapper(text=self.input_data.raw_content)
317  title = sel.xpath("//title/text()").extract()
318  localExtractor.addTag(result=response, tag_name="title", tag_value=title)
319  self.logger.debug("TYPE response.tags['title']['data']" + str(type(response.tags["title"]["data"])))
320  else:
321  self.logger.debug("resource hasn't template with title tag. Don't need adjust title.")
322  except Exception as err:
323  ExceptionLog.handler(self.logger, err, MSG_ERROR_ADJUST_TITLE, (), \
324  {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
325 
326 
Here is the caller graph for this function:

◆ applyHTTPRedirectLink()

def dc_processor.Scraper.Scraper.applyHTTPRedirectLink (   self,
  siteId,
  url,
  properties,
  response 
)

Definition at line 2160 of file Scraper.py.

2160  def applyHTTPRedirectLink(self, siteId, url, properties, response):
2161  if CONSTS.HTTP_REDIRECT_LINK_NAME in properties:
2162  self.logger.debug("Found property '%s'", str(CONSTS.HTTP_REDIRECT_LINK_NAME))
2163  propertyValue = int(properties[CONSTS.HTTP_REDIRECT_LINK_NAME])
2164 
2165  self.logger.debug("siteId: %s, url: %s, propertyValue: %s", str(siteId), str(url), str(propertyValue))
2166 # self.logger.debug("response: %s", varDump(response))
2167 
2168  headerContent = self.getHeaderContent(siteId, url)
2169  urlValue = self.getVariableFromHeaderContent(headerContent, CONSTS.LOCATION_NAME)
2170  self.logger.debug("%s value: %s", str(CONSTS.LOCATION_NAME), str(urlValue))
2171 
2172  if propertyValue == CONSTS.HTTP_REDIRECT_LINK_VALUE_URL:
2173  self.logger.debug("!!! propertyValue & %s", str(CONSTS.HTTP_REDIRECT_LINK_VALUE_URL))
2174 
2175  if CONSTS.HTTP_REDIRECT_LINK_LINK_TAG_NAME in response.tags and \
2176  "data" in response.tags[CONSTS.HTTP_REDIRECT_LINK_LINK_TAG_NAME] and \
2177  len(response.tags[CONSTS.HTTP_REDIRECT_LINK_LINK_TAG_NAME]["data"]) > 0:
2178  response.tags[CONSTS.HTTP_REDIRECT_LINK_LINK_TAG_NAME]["data"][0] = url
2179 
2180  if urlValue is not None and propertyValue == CONSTS.HTTP_REDIRECT_LINK_VALUE_LOCATION:
2181  self.logger.debug("!!! propertyValue & %s", str(CONSTS.HTTP_REDIRECT_LINK_VALUE_LOCATION))
2182 
2183  if CONSTS.HTTP_REDIRECT_LINK_LINK_TAG_NAME in response.tags and \
2184  "data" in response.tags[CONSTS.HTTP_REDIRECT_LINK_LINK_TAG_NAME] and \
2185  len(response.tags[CONSTS.HTTP_REDIRECT_LINK_LINK_TAG_NAME]["data"]) > 0:
2186  response.tags[CONSTS.HTTP_REDIRECT_LINK_LINK_TAG_NAME]["data"][0] = str(urlValue)
2187 
2188  if urlValue is not None and propertyValue == CONSTS.HTTP_REDIRECT_LINK_VALUE_REDIRECT_URL:
2189  self.logger.debug("!!! propertyValue & %s", str(CONSTS.HTTP_REDIRECT_LINK_VALUE_REDIRECT_URL))
2190  self.addCustomTag(result=response, tag_name=CONSTS.REDIRECT_URL_NAME, tag_value=[str(urlValue)])
2191 
2192  if propertyValue == CONSTS.HTTP_REDIRECT_LINK_VALUE_SOURCE_URL:
2193  self.logger.debug("!!! propertyValue & %s", str(CONSTS.HTTP_REDIRECT_LINK_VALUE_SOURCE_URL))
2194 
2195  if urlValue is not None:
2196  self.addCustomTag(result=response, tag_name=CONSTS.REDIRECT_URL_NAME, tag_value=[str(urlValue)])
2197  else:
2198  self.addCustomTag(result=response, tag_name=CONSTS.REDIRECT_URL_NAME, tag_value=[url])
2199 
2200  return response
2201 
2202 
Here is the call graph for this function:
Here is the caller graph for this function:

◆ applyPostProcessing()

def dc_processor.Scraper.Scraper.applyPostProcessing (   self,
  result,
  key,
  postProcessingRE 
)

Definition at line 1267 of file Scraper.py.

1267  def applyPostProcessing(self, result, key, postProcessingRE):
1268  if key in result.tags and "data" in result.tags[key] and result.tags[key]["data"] is not None and \
1269  len(result.tags[key]["data"]) > 0:
1270  try:
1271  matchingVal = re.compile(postProcessingRE) # #, re.UNICODE | re.MULTILINE)
1272  except re.error as err:
1273  self.logger.debug("Post-processing RE error: %s", str(err))
1274  self.errorMask = self.errorMask | APP_CONSTS.ERROR_RE_ERROR
1275  else:
1276  self.logger.debug("!!! type(result.tags[%s][\"data\"] = %s", str(key), type(result.tags[key]["data"]))
1277 
1278  tmpStr = ""
1279  matchingResult = []
1280  if isinstance(result.tags[key]["data"], basestring):
1281  matchingResult = matchingVal.findall(result.tags[key]["data"])
1282  elif isinstance(result.tags[key]["data"], list):
1283  # accumulate all results
1284  for tagData in result.tags[key]["data"]:
1285  self.logger.debug("!!! type(tagData) = %s, tagData: %s", str(type(tagData)), varDump(tagData))
1286  localRes = matchingVal.findall(tagData)
1287  matchingResult.extend(localRes)
1288 # match = re.search(postProcessingRE, tagData, re.U | re.M)
1289 # self.logger.debug("!!! match = %s, postProcessingRE = '%s'", str(match), str(postProcessingRE))
1290 # if match is not None:
1291 # matchingResult.append(str(match.group()))
1292 
1293  innerSplitString = '|||||'
1294  self.logger.debug("Post-processing has %s matched results!", str(len(matchingResult)))
1295  self.logger.debug("Post-processing matchingResult: %s", varDump(matchingResult))
1296  if len(matchingResult) > 0:
1297  for elem in matchingResult:
1298  if isinstance(elem, basestring):
1299  tmpStr += str(elem)
1300  tmpStr += self.xpathSplitString
1301  else:
1302  for innerElem in elem:
1303  if innerElem is not None and innerElem != '':
1304  tmpStr += str(innerElem)
1305  tmpStr += innerSplitString
1306  else:
1307  self.logger.debug("Post-processing has no matched results!")
1308 
1309  tmpStr = tmpStr.strip(self.xpathSplitString)
1310  if tmpStr != "":
1311  self.logger.debug("Post-processing matched and replaced with pieces!")
1312  self.logger.debug("!!! type(result.tags[%s][\"data\"])) = %s", str(key), str(type(result.tags[key]["data"])))
1313  self.logger.debug("!!! tmpStr: %s", varDump(tmpStr))
1314  if isinstance(result.tags[key]["data"], basestring):
1315  result.tags[key]["data"] = tmpStr
1316 # else:
1317 # result.tags[key]["data"][0] = tmpStr
1318  elif isinstance(result.tags[key]["data"], list):
1319  result.tags[key]["data"] = matchingResult # #tmpStr.split(innerSplitString)
1320  else:
1321  # Set not detected value if no match, changed default behavior by bgv
1322  self.logger.debug("Post-processing not matched, value replaced with None or empty!")
1323  if isinstance(result.tags[key]["data"], basestring):
1324  result.tags[key]["data"] = ''
1325  else:
1326  result.tags[key]["data"][0] = None
1327  else:
1328  self.logger.debug("Post-processing keys not found!")
1329 
1330 
Here is the call graph for this function:
Here is the caller graph for this function:
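The essence of the post-processing step is a compiled regular expression applied to every element of a tag's data, with the accumulated matches written back. A stripped-down sketch of that core loop, omitting the error-mask handling and delimiter bookkeeping above:

import re

def apply_post_processing(values, post_processing_re):
    matcher = re.compile(post_processing_re)
    collected = []
    for value in values:
        collected.extend(matcher.findall(value))
    return collected  # empty list when nothing matched

print(apply_post_processing(["price 42 EUR", "no digits here"], r"\d+"))  # ['42']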

◆ applyPubdate()

def dc_processor.Scraper.Scraper.applyPubdate (   self,
  response,
  pubdate 
)

Definition at line 818 of file Scraper.py.

818  def applyPubdate(self, response, pubdate):
819  if isinstance(pubdate, SQLExpression) and str(pubdate) == "NOW()":
820  pubdate = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
821  else:
822  d = DateTimeType.parse(pubdate, bool(self.useCurrentYear), self.logger, False)
823  self.logger.debug("Check pubdate: '%s'", str(d))
824  if d is not None:
825  pubdate = d.strftime("%Y-%m-%d %H:%M:%S")
826  else:
827  pubdate = ''
828 
829  if "pubdate" in response.tags and "data" not in response.tags["pubdate"]:
830  response.tags["pubdate"]["data"] = []
831 
832  if "pubdate" in response.tags and "data" in response.tags["pubdate"]:
833  if len(response.tags["pubdate"]["data"]) > 0:
834  response.tags["pubdate"]["data"][0] = pubdate
835  else:
836  response.tags["pubdate"]["data"] = [pubdate]
837 
838  if "pubdate" not in response.tags:
839  self.addCustomTag(result=response, tag_name=CONSTS.TAG_PUB_DATE, tag_value=[pubdate])
840 
841 
Here is the call graph for this function:
Here is the caller graph for this function:
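The method accepts either the SQL expression NOW() or a raw date string and always writes a "%Y-%m-%d %H:%M:%S" value into the pubdate tag. A standard-library sketch of that normalization, assuming an ISO-like input format (the real code delegates parsing to the project's DateTimeType class, which accepts many more formats):

# Illustrative sketch only; DateTimeType.parse() handles the general case.
import datetime

def normalize_pubdate(raw):
    if raw == "NOW()":  # the SQLExpression("NOW()") case above
        return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    try:
        parsed = datetime.datetime.strptime(raw, "%Y-%m-%dT%H:%M:%S")
    except ValueError:
        return ''       # unparsable dates are replaced with an empty string
    return parsed.strftime("%Y-%m-%d %H:%M:%S")

print(normalize_pubdate("2015-04-01T12:30:00"))  # 2015-04-01 12:30:00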

◆ calcUrlDomainCrc()

def dc_processor.Scraper.Scraper.calcUrlDomainCrc (   self,
  url 
)

Definition at line 584 of file Scraper.py.

584  def calcUrlDomainCrc(self, url):
585  urlHost = None
586  auth = urlparse.urlsplit(url)[1]
587  if auth is not None:
588  urlHost = (re.search('([^@]*@)?([^:]*):?(.*)', auth).groups())[1]
589  if urlHost is not None and urlHost.find(self.WWW_PREFIX) == 0:
590  urlHost = urlHost[len(self.WWW_PREFIX): len(urlHost)]
591 
592  return urlHost
593 
594 
Here is the caller graph for this function:
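Despite its name, the method shown above returns the bare host name of the URL (with any user info, port, and a leading "www." stripped) rather than a CRC value. A self-contained sketch of the same extraction, written for Python 2 like the surrounding code:

import re
import urlparse  # urllib.parse on Python 3

WWW_PREFIX = "www."

def url_domain(url):
    host = None
    auth = urlparse.urlsplit(url)[1]  # network location, e.g. "user@host:port"
    if auth is not None:
        # drop an optional "user@" prefix and a ":port" suffix
        host = re.search('([^@]*@)?([^:]*):?(.*)', auth).groups()[1]
    if host is not None and host.startswith(WWW_PREFIX):
        host = host[len(WWW_PREFIX):]
    return host

print(url_domain("http://user@www.example.com:8080/path"))  # example.com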

◆ checkDOMElement()

def dc_processor.Scraper.Scraper.checkDOMElement (   self,
  elem 
)

Definition at line 203 of file Scraper.py.

203  def checkDOMElement(self, elem):
204  ret = False
205  if re.search('<', elem):
206  self.logger.debug("Media tag contain DOM element: %s", elem)
207  ret = True
208  return ret
209 
210 
Here is the caller graph for this function:

◆ checkMediaTag()

def dc_processor.Scraper.Scraper.checkMediaTag (   self,
  urlStringMedia 
)

Definition at line 2090 of file Scraper.py.

2090  def checkMediaTag(self, urlStringMedia):
2091  # variable for result
2092  allowedUrls = []
2093  # self.logger.debug("!!! urlStringMedia: %s", varDump(urlStringMedia))
2094  mediaUrls = self.splitMediaTagString(urlStringMedia)
2095  # self.logger.debug("!!! mediaUrls: %s", varDump(mediaUrls))
2096 
2097  for media in mediaUrls:
2098  # Check if media is binary picture
2099  if re.search(MediaLimitsHandler.BINARY_IMAGE_SEARCH_STR, media, re.UNICODE) is not None:
2100  self.logger.debug("Tag 'media' has binary picture...")
2101 
2102  if self.mediaLimitsHandler is None:
2103  allowedUrls.append(media)
2104  else:
2105  if self.mediaLimitsHandler.isAllowedLimits(urlString=media, binaryType=True):
2106  allowedUrls.append(media)
2107  else:
2108  self.logger.debug("Binary media tag has not allowed limits. Skipped...")
2109 
2110  # Check is media content valid url
2111  elif isValidURL(media):
2112  self.logger.debug("Tag 'media' has valid url: %s", str(media))
2113  if self.mediaLimitsHandler is None:
2114  allowedUrls.append(media)
2115  else:
2116  if self.mediaLimitsHandler.isAllowedLimits(media):
2117  allowedUrls.append(media)
2118  else:
2119  self.logger.debug("Media tag has not allowed limits. Skipped. Url: %s", str(media))
2120 
2121  # Invalid url of 'media' tag
2122  else:
2123  self.logger.debug("Invalid url in tag 'media'... Url: %s", str(media))
2124 
2125  return allowedUrls
2126 
2127 
Here is the call graph for this function:
Here is the caller graph for this function:

◆ commonResultOperations()

def dc_processor.Scraper.Scraper.commonResultOperations (   self,
  result 
)

Definition at line 1423 of file Scraper.py.

1423  def commonResultOperations(self, result):
1424  empty_tags = result.getEmptyTags()
1425  for localKey in EXTENDED_NEWS_TAGS:
1426  if localKey in empty_tags or (localKey in result.tags and result.isTagFilled(localKey) is False):
1427  self.extractAdditionTagsByScrapy(result, localKey, EXTENDED_NEWS_TAGS[localKey])
1428  for tagName in LINKS_NEWS_TAGS:
1429  if tagName in result.tags:
1430  if isinstance(result.tags[tagName], dict) and (result.tags[tagName]["xpath"] == "" or \
1431  result.tags[tagName]["xpath"].find("/@src") != -1 or result.tags[tagName]["xpath"].find("/@href") != -1):
1432  result.tags[tagName]["data"] = \
1433  self.dataUrlsCanonizator(result.tags[tagName]["data"], self.baseUrl)
1434 
1435  self.refineCommonText(CONSTS.TAG_CONTENT_UTF8_ENCODED, result)
1436  self.refineBadDateTags(result)
1437 
1438 
Here is the call graph for this function:
Here is the caller graph for this function:

◆ compileResults()

def dc_processor.Scraper.Scraper.compileResults (   self,
  result,
  resultsList,
  key,
  xPathPreparing = None 
)

Definition at line 1035 of file Scraper.py.

1035  def compileResults(self, result, resultsList, key, xPathPreparing=None):
1036  for elem in resultsList:
1037  if key in result.tags:
1038  if result.tags[key] is not None:
1039  if result.tags[key]["xpath"] is None:
1040  result.tags[key]["xpath"] = elem["obj"].tags[key]["xpath"]
1041  else:
1042  result.tags[key]["xpath"] += ' '
1043  result.tags[key]["xpath"] += elem["obj"].tags[key]["xpath"]
1044  if result.tags[key]["data"] is None or len(result.tags[key]["data"]) == 0:
1045  result.tags[key]["data"] = elem["obj"].tags[key]["data"]
1046  else:
1047  if xPathPreparing is not None:
1048  self.xpathSplitString = xPathPreparing.resolveDelimiter(elem, self.properties, self.xpathSplitString)
1049  result.tags[key]["data"][0] += self.xpathSplitString
1050  else:
1051  result.tags[key]["data"][0] += ' '
1052  result.tags[key]["data"][0] += elem["obj"].tags[key]["data"][0]
1053  else:
1054  result.tags.update(elem["obj"].tags)
1055 
1056 
Here is the caller graph for this function:

◆ createArticle()

def dc_processor.Scraper.Scraper.createArticle (   self)

Definition at line 1854 of file Scraper.py.

1854  def createArticle(self):
1855  resid = self.entry["urlMd5"]
1856  self.article = Result(self.config, resid, self.metrics)
1857 
1858  for tag in self.entry["entry"]:
1859  data = {"extractor":"feedParser extractor", "data":"", "name":""}
1860  data["data"] = self.entry["entry"][tag]
1861  data["name"] = tag
1862  self.article.tags[tag] = data
1863 
1864  date_tags = ["published", "updated", "updated_parsed"]
1865  if len(set(self.entry["entry"].keys()).intersection(date_tags)) == 0:
1866  self.logger.debug("PUBDATE_ERROR: list of tags from rss feed: %s" % str(self.entry["entry"].keys()))
1867 
1868  if "pubdate" in self.entry and self.article.tags["pubdate"] == "":
1869  data = {"extractor":"feedParser extractor", "data":"", "name":""}
1870  data["data"] = self.entry["pubdate"]
1871  data["name"] = "pubdate"
1872  self.article.tags["pubdate"] = data
1873 
1874  # parent rss feed
1875  data = {"extractor":"feedParser extractor", "data":"", "name":""}
1876  data["data"] = self.entry["parent_rss_feed"]
1877  data["name"] = "parent_rss_feed"
1878  data["xpath"] = ""
1879  data["extractor"] = self.__class__.__name__
1880  self.article.tags["parent_rss_feed"] = data
1881 
1882  # parent rss feed urlMd5
1883  data = {"extractor":"feedParser extractor", "data":"", "name":""}
1884  data["data"] = self.entry["parent_rss_feed_urlMd5"]
1885  data["name"] = "parent_rss_feed_urlMd5"
1886  data["xpath"] = ""
1887  data["extractor"] = self.__class__.__name__
1888  self.article.tags["parent_rss_feed_urlMd5"] = data
1889 
1890  # tags count
1891  self.article.tagsCount = len(self.article.tags.keys())
1892 
1893 
Here is the caller graph for this function:

◆ createModule()

def dc_processor.Scraper.Scraper.createModule (   self,
  module_name 
)

Definition at line 1794 of file Scraper.py.

1794  def createModule(self, module_name):
1795  appInst = None
1796  try:
1797 # appInst = (module_name, eval(module_name)(self.config, None, self.urlHost, self.properties))[1] # pylint: disable=W0123
1798  appInst = (module_name, eval(module_name)(self.config,
1799  None,
1800  self.getDomainsForUrlSourcesRules(self.urlSourcesRules),
1801  self.properties))[1]
1802  self.logger.debug("%s has been created!" % module_name)
1803  except Exception as err:
1804  ExceptionLog.handler(self.logger, err, "Can't create module %s. Error is:" % (module_name))
1805 
1806  return appInst
1807 
1808 
Here is the call graph for this function:
Here is the caller graph for this function:

◆ dataUrlsCanonizator()

def dc_processor.Scraper.Scraper.dataUrlsCanonizator (   self,
  data,
  baseUrl = None,
  useAdditionEncoding = False 
)

Definition at line 1129 of file Scraper.py.

1129  def dataUrlsCanonizator(self, data, baseUrl=None, useAdditionEncoding=False):
1130  ret = data
1131  # self.logger.debug(">>> url canonizator = " + str(data))
1132  if isinstance(data, basestring):
1133  ret = self.elemUrlsCanoizator(data, baseUrl, useAdditionEncoding=useAdditionEncoding)
1134  elif isinstance(data, list):
1135  ret = []
1136  for elem in data:
1137  elem = self.elemUrlsCanoizator(elem, baseUrl, useAdditionEncoding=useAdditionEncoding)
1138  ret.append(elem)
1139  return ret
1140 
1141 
Here is the call graph for this function:
Here is the caller graph for this function:

◆ elemUrlsCanoizator()

def dc_processor.Scraper.Scraper.elemUrlsCanoizator (   self,
  data,
  baseUrl = None,
  firstDelim = ' ',
  secondDelim = ',',
  useAdditionEncoding = False 
)

Definition at line 1101 of file Scraper.py.

1101  def elemUrlsCanoizator(self, data, baseUrl=None, firstDelim=' ', secondDelim=',', useAdditionEncoding=False):
1102  normMask = UrlNormalizator.NORM_NONE
1103  if "URL_NORMALIZE_MASK_PROCESSOR" in self.properties:
1104  normMask = int(self.properties["URL_NORMALIZE_MASK_PROCESSOR"])
1105 
1106  ret = data
1107  if data.strip() != "":
1108  ret = ""
1109  for elem in data.split(firstDelim):
1110  if elem.strip() != "":
1111  localUrl = elem
1112  if baseUrl is not None:
1113 # localUrl = urlparse.urljoin(baseUrl, localUrl)
1114  localUrl = urlNormalization(baseUrl, localUrl)
1115  processedUrl = dc_event.URL(0, localUrl, normalizeMask=normMask).getURL(normMask)
1116  if useAdditionEncoding:
1117  processedUrl = xml.sax.saxutils.escape(processedUrl, {})
1118  ret += processedUrl + secondDelim
1119  if ret != "" and ret[-1] == secondDelim:
1120  ret = ret[0: len(ret) - 1]
1121  return ret
1122 
1123 
Here is the call graph for this function:
Here is the caller graph for this function:
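The method splits a whitespace-separated list of URLs, resolves each against the base URL, optionally XML-escapes it, and re-joins the results with a comma. A simplified sketch in which urlparse.urljoin stands in for the project's urlNormalization()/dc_event.URL pipeline (the real code also applies the URL_NORMALIZE_MASK_PROCESSOR property):

import urlparse  # urllib.parse on Python 3
import xml.sax.saxutils

def canonize_urls(data, base_url=None, first_delim=' ', second_delim=',', escape=False):
    if data.strip() == "":
        return data
    out = []
    for elem in data.split(first_delim):
        if elem.strip() == "":
            continue
        url = urlparse.urljoin(base_url, elem) if base_url else elem
        out.append(xml.sax.saxutils.escape(url) if escape else url)
    return second_delim.join(out)

print(canonize_urls("/a.png /b.png", base_url="http://example.com/news/"))
# http://example.com/a.png,http://example.com/b.png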

◆ extractAdditionTagsByScrapy()

def dc_processor.Scraper.Scraper.extractAdditionTagsByScrapy (   self,
  localResult,
  key,
  tagsXpaths 
)

Definition at line 1493 of file Scraper.py.

1493  def extractAdditionTagsByScrapy(self, localResult, key, tagsXpaths):
1494  self.logger.debug(">>> Start addition news extracting")
1495  extractor = self.getExtractorByName("ScrapyExtractor")
1496  if extractor is not None:
1497  sel = SelectorWrapper(text=self.input_data.raw_content)
1498  for tagsXpath in tagsXpaths:
1499  if tagsXpath is not None and tagsXpath != "":
1500  localXpath = sel.xpath(tagsXpath)
1501  localValue = Utils.innerText(localXpath, ' ', ' ', self.properties[CONSTS.TAG_MARKUP_PROP_NAME] \
1502  if CONSTS.TAG_MARKUP_PROP_NAME in self.properties else None, None,
1503  self.attrConditions)
1504  if localValue != "":
1505  extractor.addTag(localResult, key, localValue, tagsXpath)
1506  break
1507  else:
1508  self.logger.debug(">>> Cant extract tag=%s for xpath=%s" % (key, tagsXpath))
1509 
1510 
Here is the call graph for this function:
Here is the caller graph for this function:

◆ extractBaseUrlRssFeed()

def dc_processor.Scraper.Scraper.extractBaseUrlRssFeed (   self,
  siteId,
  url 
)

Definition at line 1965 of file Scraper.py.

1965  def extractBaseUrlRssFeed(self, siteId, url):
1966  # variable for result
1967  ret = None
1968 
1969  self.logger.debug("!!! siteId: %s, url: %s", str(siteId), str(url))
1970  headerContent = self.getHeaderContent(siteId, url)
1971  if headerContent is not None:
1972  ret = self.getVariableFromHeaderContent(headerContent, CRAWLER_CONSTS.baseUrlHeaderName)
1973 
1974  self.logger.debug('!!! ret: ' + str(ret))
1975 
1976  return ret
1977 
1978 
Here is the call graph for this function:
Here is the caller graph for this function:

◆ extractFeedUrlRssFeed()

def dc_processor.Scraper.Scraper.extractFeedUrlRssFeed (   self,
  siteId,
  url 
)

Definition at line 1946 of file Scraper.py.

1946  def extractFeedUrlRssFeed(self, siteId, url):
1947  # variable for result
1948  ret = None
1949 
1950  self.logger.debug("!!! siteId: %s, url: %s", str(siteId), str(url))
1951  headerContent = self.getHeaderContent(siteId, url)
1952  if headerContent is not None:
1953  ret = self.getVariableFromHeaderContent(headerContent, CRAWLER_CONSTS.rssFeedUrlHeaderName)
1954 
1955  self.logger.debug('!!! ret: ' + str(ret))
1956 
1957  return ret
1958 
1959 
Here is the call graph for this function:
Here is the caller graph for this function:

◆ extractPubDate()

def dc_processor.Scraper.Scraper.extractPubDate (   self,
  response,
  dataTagName 
)

Definition at line 468 of file Scraper.py.

468  def extractPubDate(self, response, dataTagName):
469  # variable for result
470  ret = None
471  timezone = ''
472  try:
473  if response is not None and dataTagName in response.tags and response.tags[dataTagName] is not None:
474 
475  # self.logger.debug("extractPubDate response: " + varDump(response))
476 
477  inputData = response.tags[dataTagName]["data"]
478  self.logger.debug("extractPubDate response has '" + str(dataTagName) + "' is: " + str(inputData))
479  self.logger.debug("extractPubDate type of '" + str(dataTagName) + "' is: " + str(type(inputData)))
480 
481  inputList = []
482  if isinstance(inputData, basestring):
483  inputList = [inputData]
484  elif isinstance(inputData, list):
485  inputList = inputData
486  else:
487  pass
488 
489  pubdate = []
490  timezones = []
491  for inputElem in inputList:
492  d = DateTimeType.parse(inputElem, bool(self.useCurrentYear), self.logger, False)
493  self.logger.debug('pubdate: ' + str(d))
494 
495  if d is not None:
496  d, tzone = DateTimeType.split(d)
497  pubdate.append(d.isoformat(DateTimeType.ISO_SEP))
498  timezones.append(tzone)
499 
500  self.logger.debug("extractPubDate result pubdate: " + str(pubdate))
501  response.tags[dataTagName]["data"] = pubdate
502  if len(pubdate) > 0:
503  ret = pubdate[0]
504 
505  if len(timezones) > 0:
506  timezone = timezones[0]
507 
508  except Exception, err:
509  ExceptionLog.handler(self.logger, err, 'extractPubDate error:', (), \
510  {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
511 
512  return ret, timezone
513 
514 
Here is the caller graph for this function:
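extractPubDate() normalizes whatever sits in the given tag (a string or a list of strings) into a list of ISO date strings and returns the first date together with its timezone. A hedged sketch of the parsing loop in which dateutil.parser stands in for the project's DateTimeType (which additionally handles current-year completion and timezone splitting):

from dateutil import parser as date_parser

def extract_pubdates(tag_data):
    values = tag_data if isinstance(tag_data, list) else [tag_data]
    parsed = []
    for value in values:
        try:
            parsed.append(date_parser.parse(value).isoformat(' '))
        except (ValueError, OverflowError):
            continue  # skip elements that are not recognizable dates
    return parsed

print(extract_pubdates(["01 Apr 2015 12:30:00 +0200", "not a date"]))
# ['2015-04-01 12:30:00+02:00']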

◆ extractPubdateRssFeed()

def dc_processor.Scraper.Scraper.extractPubdateRssFeed (   self,
  siteId,
  url 
)

Definition at line 1916 of file Scraper.py.

1916  def extractPubdateRssFeed(self, siteId, url):
1917  # variable for result
1918  pubdate = None
1919  timezone = ''
1920 
1921  self.logger.debug("!!! siteId: %s, url: %s", str(siteId), str(url))
1922  headerContent = self.getHeaderContent(siteId, url)
1923  rawPubdate = self.getVariableFromHeaderContent(headerContent, CRAWLER_CONSTS.pubdateRssFeedHeaderName)
1924 
1925 # self.logger.debug('!!! getVariableFromHeaderContent: ' + str(rawPubdate))
1926  if rawPubdate is not None:
1927  try:
1928  dt = DateTimeType.parse(rawPubdate, True, self.logger, False)
1929  if dt is not None:
1930  dt, timezone = DateTimeType.split(dt)
1931  pubdate = dt.strftime("%Y-%m-%d %H:%M:%S")
1932 
1933  if timezone is '':
1934  timezone = '+0000'
1935  except Exception, err:
1936  self.logger.debug("Unsupported date format: '%s', error: %s", str(rawPubdate), str(err))
1937 
1938  return pubdate, timezone
1939 
1940 
Here is the call graph for this function:
Here is the caller graph for this function:

◆ feedParserProcess()

def dc_processor.Scraper.Scraper.feedParserProcess (   self)

Definition at line 1832 of file Scraper.py.

1832  def feedParserProcess(self):
1833  self.logger.debug("URL: %s" % str(self.input_data.url))
1834  self.logger.debug("URLMd5: %s" % str(self.input_data.urlId))
1835  self.logger.debug("SiteId: %s" % str(self.input_data.siteId))
1836  if self.parseFeed():
1837  self.tagsCount = self.article.tagsCount
1838  self.tagsMask = self.article.tagsMask
1839  self.processedContent = self.article.get()
1840  # correct pubdate
1841  if CONSTS.PUBLISHED in self.article.tags:
1842  # self.pubdate = parse(self.article.tags[CONSTS.PUBLISHED]["data"]).strftime(CONSTS.COMMON_DATE_FORMAT)
1843  self.pubdate = DateTimeType.parse(self.article.tags[CONSTS.PUBLISHED]["data"], bool(self.useCurrentYear), \
1844  self.logger)
1845  else:
1846  self.logger.debug("Resource %s hasn't publish date" % str(self.article.tags[CONSTS.TAG_LINK]["data"]))
1847  else:
1848  self.logger.debug("Resource hasn't raw content. Exit.")
1849 
1850 
Here is the call graph for this function:
Here is the caller graph for this function:

◆ formatOutpuElement()

def dc_processor.Scraper.Scraper.formatOutpuElement (   self,
  elem,
  localOutputFormat 
)

Definition at line 851 of file Scraper.py.

851  def formatOutpuElement(self, elem, localOutputFormat):
852  ret = elem
853  if localOutputFormat == "json":
854  localStr = json.dumps(elem, ensure_ascii=False)
855 
856  if len(localStr) > 0:
857  if localStr[0] == '\"' or localStr[0] == '\'':
858  localStr = localStr[1:]
859  if localStr[-1] == '\"' or localStr[-1] == '\'':
860  localStr = localStr[0:-1]
861 
862  ret = localStr
863  elif localOutputFormat == "html" or localOutputFormat == "xml":
864  ret = xml.sax.saxutils.escape(elem, {"'": "&apos;", "\"" : "&quot;"})
865  elif localOutputFormat == "sql":
866  # ret = mdb.escape_string(elem) # pylint: disable=E1101
867  ret = Utils.escape(elem)
868 
869  return ret
870 
871 
Here is the caller graph for this function:
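Each output format gets its own escaping: json.dumps with the surrounding quotes dropped, XML/HTML entity escaping, or SQL escaping. A compact sketch of the same dispatch, where simple quote doubling stands in for the project-specific Utils.escape() used by the "sql" branch:

import json
import xml.sax.saxutils

def format_output_element(elem, output_format):
    if output_format == "json":
        return json.dumps(elem, ensure_ascii=False)[1:-1]  # drop the quotes json.dumps adds
    if output_format in ("html", "xml"):
        return xml.sax.saxutils.escape(elem, {"'": "&apos;", "\"": "&quot;"})
    if output_format == "sql":
        return elem.replace("'", "''")  # stand-in for Utils.escape()
    return elem

print(format_output_element('a "quoted" <tag>', "xml"))  # a &quot;quoted&quot; &lt;tag&gt;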

◆ formatOutputData()

def dc_processor.Scraper.Scraper.formatOutputData (   self,
  response,
  localOutputFormat 
)

Definition at line 872 of file Scraper.py.

872  def formatOutputData(self, response, localOutputFormat):
873  for key in response.tags:
874  if "data" in response.tags[key]:
875  if isinstance(response.tags[key]["data"], list):
876  for i, elem in enumerate(response.tags[key]["data"]):
877  if len(response.tags[key]["data"]) > i:
878  response.tags[key]["data"][i] = self.formatOutpuElement(elem, localOutputFormat)
879 
880  elif isinstance(response.tags[key]["data"], str) or isinstance(response.tags[key]["data"], unicode):
881  response.tags[key]["data"] = self.formatOutpuElement(response.tags[key]["data"], localOutputFormat)
882 
883 
Here is the call graph for this function:
Here is the caller graph for this function:

◆ formatTag()

def dc_processor.Scraper.Scraper.formatTag (   self,
  result,
  path,
  key,
  pathDict,
  isExtract 
)

Definition at line 1144 of file Scraper.py.

1144  def formatTag(self, result, path, key, pathDict, isExtract):
1145  # Andrey Add
1146  self.logger.debug("Tag name: '%s', tag type: %s, tag format: '%s'",
1147  str(key), str(path["type"]), str(path["format"]))
1148  # Add End
1149  if path["type"] == "text":
1150  localText = ''
1151  for elem in result.tags[key]["data"]:
1152  localText += (elem.strip() + self.xpathSplitString)
1153  localText = localText.strip(self.xpathSplitString)
1154  localMaxCh = None
1155  if "format" in pathDict and "maxCh" in pathDict["format"]:
1156  localMaxCh = pathDict["format"]["maxCh"]
1157  self.logger.debug("!!! get localMaxCh from pathDict[\"format\"][\"maxCh\"] = %s", str(localMaxCh))
1158  else:
1159  localMaxCh = path["format"]
1160  if isinstance(localMaxCh, basestring) and localMaxCh == "":
1161  localMaxCh = 0
1162  self.logger.debug("!!! get localMaxCh from [\"format\"] = %s", str(localMaxCh))
1163 
1164  try:
1165  if localMaxCh is not None and int(localMaxCh) > 0 and len(localText) > int(localMaxCh):
1166  localText = localText[0: int(localMaxCh)]
1167  except ValueError, err:
1168  self.logger.debug("!!! Use wrong value, error: %s", str(err))
1169 
1170  result.tags[key]["data"] = []
1171  result.tags[key]["data"].append(localText)
1172  elif path["type"] == "html":
1173  # >>> html
1174  for i, elem in enumerate(result.tags[key]["data"]):
1175  result.tags[key]["data"][i] = re.sub(r"<![ \r\n\t]*(--([^\-]|[\r\n]|-[^\-])*--[ \r\n\t]*)>", "", elem)
1176  self.logger.debug(">>> After RE = " + str(result.tags[key]["data"]))
1177 # # # apply post processing algorithm
1178 # self.postprocessing(result, path, key)
1179  # >>> html END
1180  elif path["type"] == "datetime":
1181  # >>> datetime
1182  bestData = ''
1183  try:
1184  self.logger.debug("Try to convert data")
1185  if not isExtract:
1186  # New use default value as a format string for current date
1187  if len(result.tags[key]["data"][0]) > 0 and result.tags[key]["data"][0][0] == '@':
1188  localFormatStr = result.tags[key]["data"][0][1: len(result.tags[key]["data"][0])]
1189  localTm = datetime.datetime.fromtimestamp(time.time())
1190  result.tags[key]["data"][0] = datetime.datetime.strftime(localTm, localFormatStr)
1191  else:
1192  bestData = self.getBestDatatimeData(result.tags[key]["data"])
1193  self.logger.debug(">>> Time log Before = " + bestData)
1194  if path["format"] != "" and path["format"] != "FULL":
1195  result.tags[key]["data"][0] = datetime.datetime.strftime(parser.parse(bestData), path["format"])
1196  else:
1197  result.tags[key]["data"][0] = str(parser.parse(bestData))
1198  self.logger.debug(">>> Time log after = " + result.tags[key]["data"][0])
1199  except Exception as err:
1200  self.logger.debug("Can't convert data <<< " + str(result.tags) + " " + str(key) + " err = " + str(err))
1201  result.tags[key]["data"][0] = bestData
1202  if len(result.tags[key]["data"]) > 0:
1203  result.tags[key]["data"] = [result.tags[key]["data"][0]]
1204  # >>> datetime END
1205  elif path["type"] == "image":
1206  if path["format"] == "URL" and "canonicalizeURLs" in path and int(path["canonicalizeURLs"]) == 1:
1207  result.tags[key]["data"] = self.dataUrlsCanonizator(result.tags[key]["data"], self.baseUrl)
1208  elif path["type"] == "link":
1209  formatName = path["format"]
1210  if len(formatName.split(',')) > 1:
1211  formatName = formatName.split(',')[1]
1212  if formatName == "email-address" or formatName == "email-to":
1213  localText = ''
1214  if isinstance(result.tags[key]["data"], basestring):
1215  self.logger.debug(">>> mail to str type")
1216  localText = result.tags[key]["data"].strip(self.xpathSplitString)
1217  index = localText.find("mailto:")
1218  if index >= 0:
1219  localText = localText[index + len("mailto:"), len(localText)]
1220  else:
1221  localText = ""
1222  elif isinstance(result.tags[key]["data"], list):
1223  self.logger.debug(">>> mail to list type")
1224  for elem in result.tags[key]["data"]:
1225  elemText = elem.strip(self.xpathSplitString)
1226  index = elemText.find("mailto:")
1227  if index >= 0:
1228  elemText = elemText[index + len("mailto:"): len(elemText)]
1229  if formatName == "email-address":
1230  elemText = Utils.emailParse(elemText)
1231  else:
1232  elemText = Utils.emailParse(elemText, True)
1233  else:
1234  elemText = ""
1235  if elemText != "":
1236  localText += (elemText + self.xpathSplitString)
1237 
1238  result.tags[key]["data"] = []
1239  result.tags[key]["data"].append(localText)
1240  if "canonicalizeURLs" in path and int(path["canonicalizeURLs"]) == 1:
1241  result.tags[key]["data"] = self.dataUrlsCanonizator(result.tags[key]["data"], self.baseUrl)
1242  elif path["type"] == "attribute":
1243  if isExtract:
1244  localText = ''
1245  if isinstance(result.tags[key]["data"], basestring):
1246  localText = result.tags[key]["data"]
1247  elif isinstance(result.tags[key]["data"], list):
1248  localText = self.xpathSplitString.join([elem for elem in result.tags[key]["data"] if elem != ''])
1249  splittedFormatString = path["format"].split(',')
1250  if len(splittedFormatString) >= 2:
1251  try:
1252  if int(splittedFormatString[0]) < len(localText):
1253  localText = localText[0: int(splittedFormatString[0])]
1254  except Exception as err:
1255  self.logger.debug("Error: %s; Wrong path format for attribute rule, format=%s", str(err), path["format"])
1256  result.tags[key]["data"] = []
1257  result.tags[key]["data"].append(localText)
1258 
1259  localElem = ''
1260  for elem in result.tags[key]["data"]:
1261  localElem += elem
1262  localElem += self.xpathSplitString
1263  result.tags[key]["data"][0] = localElem
1264  result.tags[key]["data"][0] = result.tags[key]["data"][0].strip(self.xpathSplitString)
1265 
1266 
Here is the call graph for this function:
Here is the caller graph for this function:

◆ getBestDatatimeData()

def dc_processor.Scraper.Scraper.getBestDatatimeData (   self,
  data 
)

Definition at line 1344 of file Scraper.py.

1344  def getBestDatatimeData(self, data):
1345  ret = ""
1346  if isinstance(data, list):
1347  for elem in data:
1348  for ch in elem:
1349  if ch >= '0' and ch <= '9':
1350  ret = elem
1351  break
1352  if ret is not None:
1353  break
1354  if ret is None:
1355  ret = data[0]
1356  else:
1357  ret = data
1358  if isinstance(ret, basestring):
1359  ret = ret.replace('\n', '')
1360  ret = ret.replace('\t', '')
1361  else:
1362  ret = ""
1363  return ret
1364 
1365 
Here is the caller graph for this function:
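The intent of the selection above appears to be: pick the first list element that contains a digit as the best datetime candidate, then strip newlines and tabs from it. A standalone sketch of that selection:

def best_datetime_data(data):
    if isinstance(data, list):
        # first element containing a digit, else the first element at all
        best = next((elem for elem in data if any(ch.isdigit() for ch in elem)),
                    data[0] if data else "")
    else:
        best = data
    return best.replace('\n', '').replace('\t', '') if isinstance(best, str) else ""

print(best_datetime_data(["published on", "01 Apr 2015\n"]))  # 01 Apr 2015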

◆ getDomainsForUrlSourcesRules()

def dc_processor.Scraper.Scraper.getDomainsForUrlSourcesRules (   self,
  urlSourcesRules 
)

Definition at line 2207 of file Scraper.py.

2207  def getDomainsForUrlSourcesRules(self, urlSourcesRules):
2208  self.logger.debug("Incoming value urlSourcesRules: %s", varDump(urlSourcesRules))
2209  # variable for result
2210  domains = []
2211 
2212  for urlSourcesRule in urlSourcesRules:
2213  if urlSourcesRule == URL_SOURCES_RULE_DATA_URL:
2214  self.logger.debug("dataUrl: %s", str(self.input_data.url))
2215  self.logger.debug("urlHost: %s", str(self.urlHost))
2216 
2217  domain = self.calcUrlDomainCrc(self.input_data.url)
2218  self.logger.debug("domain: %s", str(domain))
2219 
2220  if domain is not None:
2221  domains.append(domain)
2222 
2223  if urlSourcesRule == URL_SOURCES_RULE_REDIRECT_URL:
2224  headerContent = self.getHeaderContent(self.input_data.siteId, self.input_data.url)
2225  redirectUrl = self.getVariableFromHeaderContent(headerContent, CONSTS.LOCATION_NAME)
2226  self.logger.debug("redirectUrl: %s", str(redirectUrl))
2227 
2228  if isinstance(redirectUrl, basestring):
2229  domain = self.calcUrlDomainCrc(redirectUrl)
2230  self.logger.debug("domain: %s", str(domain))
2231 
2232  if domain is not None:
2233  domains.append(domain)
2234 
2235  if urlSourcesRule == URL_SOURCES_RULE_FEED_URL:
2236  feedUrl = self.extractFeedUrlRssFeed(self.input_data.siteId, self.input_data.url)
2237  self.logger.debug("feedUrl: %s", str(feedUrl))
2238 
2239  if isinstance(feedUrl, basestring):
2240  domain = self.calcUrlDomainCrc(feedUrl)
2241  self.logger.debug("domain: %s", str(domain))
2242 
2243  if domain is not None:
2244  domains.append(domain)
2245 
2246  if len(domains) == 0:
2247  domains.append(self.urlHost)
2248 
2249  self.logger.debug("return domains: %s", varDump(domains))
2250 
2251  return domains
2252 
Here is the call graph for this function:
Here is the caller graph for this function:

◆ getExitCode()

def dc_processor.Scraper.Scraper.getExitCode (   self)

Definition at line 1823 of file Scraper.py.

1823  def getExitCode(self):
1824  return self.exitCode
1825 

◆ getExtractorByName()

def dc_processor.Scraper.Scraper.getExtractorByName (   self,
  extractorName 
)

Definition at line 1814 of file Scraper.py.

1814  def getExtractorByName(self, extractorName):
1815  for extractor in self.extractors:
1816  if extractor.__class__.__name__ == extractorName:
1817  return extractor
1818 
1819 
Here is the caller graph for this function:

◆ getHeaderContent()

def dc_processor.Scraper.Scraper.getHeaderContent (   self,
  siteId,
  url 
)

Definition at line 1984 of file Scraper.py.

1984  def getHeaderContent(self, siteId, url):
1985  # variable for result
1986  headerContent = None
1987  urlContentObj = dc_event.URLContentRequest(siteId, url, \
1988  dc_event.URLContentRequest.CONTENT_TYPE_RAW_LAST + \
1989  dc_event.URLContentRequest. CONTENT_TYPE_RAW + \
1990  dc_event.URLContentRequest.CONTENT_TYPE_HEADERS)
1991 
1992  rawContentData = self.dbWrapper.urlContent([urlContentObj])
1993 
1994  if rawContentData is not None and len(rawContentData) > 0:
1995  if rawContentData[0].headers is not None and len(rawContentData[0].headers) > 0 and \
1996  rawContentData[0].headers[0] is not None:
1997  headerContent = rawContentData[0].headers[0].buffer
1998 
1999  return headerContent
2000 
2001 
Here is the caller graph for this function:

◆ getNextBestExtractor()

def dc_processor.Scraper.Scraper.getNextBestExtractor (   self)

Definition at line 1511 of file Scraper.py.

1511  def getNextBestExtractor(self):
1512  # return extractor with highest rank
1513  try:
1514  extractor = next(self.itr)
1515  except StopIteration:
1516  extractor = None
1517  return extractor
1518 
1519 
Here is the caller graph for this function:
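self.itr is an ordinary Python iterator over the extractors (highest rank first, per the comment in the code), so an exhausted queue simply yields None. The same "next or None" pattern in isolation, with illustrative extractor names:

def next_best(itr):
    try:
        return next(itr)
    except StopIteration:
        return None

extractors = iter(["NewsExtractor", "ScrapyExtractor"])  # illustrative names
print(next_best(extractors))  # NewsExtractor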

◆ getProcessedContent()

def dc_processor.Scraper.Scraper.getProcessedContent (   self,
  result 
)

Definition at line 1522 of file Scraper.py.

1522  def getProcessedContent(self, result):
1523  for elem in result:
1524  elem.get()
1525 
1526 # self.logger.info("!!! result[0].tags[\"content_encoded\"][\"data\"][0]: %s",
1527 # str(result[0].tags["content_encoded"]["data"][0]))
1528 
1529 # if "content_encoded" in result[0].tags and "data" in result[0].tags["content_encoded"] and \
1530 # len(result[0].tags["content_encoded"]["data"]) > 0:
1531 # result[0].tags["content_encoded"]["data"][0] = result[0].tags["content_encoded"]["data"][0].replace('\\n', '\n')
1532 
1533  self.processedContent = {}
1534  self.processedContent["default"] = result[0]
1535  self.processedContent["internal"] = result
1536  self.processedContent["custom"] = []
1537  self.tagsCount = result[0].tagsCount
1538  self.tagsMask = result[0].tagsMask
1539 
1540 # #TODO remove in future ## checked now
1541  if "pubdate" in result[0].tags and "data" in result[0].tags["pubdate"] and \
1542  len(result[0].tags["pubdate"]["data"]) > 0:
1543  self.pubdate = result[0].tags["pubdate"]["data"][0]
1544  self.logger.debug('>>>> Set self.pubdate = ' + str(self.pubdate))
1545  self.input_data.batch_item.urlObj.pDate = self.pubdate
1546 
1547 
Here is the caller graph for this function:

◆ getTemplate()

def dc_processor.Scraper.Scraper.getTemplate (   self,
  explicit = True 
)

Definition at line 884 of file Scraper.py.

884  def getTemplate(self, explicit=True):
885  if isinstance(self.input_data.template, dict):
886  template = self.input_data.template
887  else:
888  # template = ast.literal_eval(self.input_data.template)
889  # TODO:strange potential backdoor for malicious code, cancelled by bgv
890  if explicit:
891  self.logger.error("Wrong template structure: `%s` but dict expected, assumed empty!",
892  str(type(self.input_data.template)))
893  self.logger.debug("Template:\n%s", str(self.input_data.template))
894  template = {}
895 
896  return template
897 
898 
Here is the caller graph for this function:

◆ getVariableFromHeaderContent()

def dc_processor.Scraper.Scraper.getVariableFromHeaderContent (   self,
  headerContent,
  name,
  makeDecode = True 
)

Definition at line 2008 of file Scraper.py.

2008  def getVariableFromHeaderContent(self, headerContent, name, makeDecode=True):
2009  # variable for result
2010  ret = None
2011 
2012  header = ''
2013  if isinstance(headerContent, basestring):
2014  if makeDecode:
2015  header = base64.b64decode(headerContent)
2016  else:
2017  header = headerContent
2018 
2019  headerList = header.split('\r\n')
2020  self.logger.debug("headerList: " + varDump(headerList))
2021 
2022  for elem in headerList:
2023  pos = elem.find(name + ':')
2024 # self.logger.debug("!!! name: '%s', pos = %s", str(name), str(pos))
2025  if pos > -1:
2026  ret = elem.replace(name + ':', '').strip()
2027  self.logger.debug("Found '" + name + "' has value: " + str(ret))
2028  break
2029 
2030  return ret
2031 
2032 
Here is the call graph for this function:
Here is the caller graph for this function:
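The stored header block is base64-encoded; the method decodes it, splits it on CRLF, and returns the value of the first line that carries the requested "Name:" prefix. A self-contained sketch (Python 2, matching the codebase):

import base64

def header_variable(header_content, name, make_decode=True):
    header = base64.b64decode(header_content) if make_decode else header_content
    for line in header.split('\r\n'):
        if line.find(name + ':') > -1:
            return line.replace(name + ':', '').strip()
    return None

raw = base64.b64encode("HTTP/1.1 301 Moved\r\nLocation: http://example.com/new\r\n")
print(header_variable(raw, "Location"))  # http://example.com/new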

◆ loadConfig()

def dc_processor.Scraper.Scraper.loadConfig (   self)

Definition at line 1699 of file Scraper.py.

1699  def loadConfig(self):
1700  try:
1701  self.config = ConfigParser.ConfigParser()
1702  self.config.optionxform = str
1703  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1704  if self.pargs.config:
1705  self.config.read(self.pargs.config)
1706  else:
1707  self.config.read(APP_NAME)
1708  else:
1709  self.config.read(self.configFile)
1710  except:
1711  print MSG_ERROR_LOAD_CONFIG
1712  raise
1713 
1714 
Here is the caller graph for this function:

◆ loadExtractors()

def dc_processor.Scraper.Scraper.loadExtractors (   self)

Definition at line 1550 of file Scraper.py.

1550  def loadExtractors(self):
1551  try:
1552  # modules
1553  if CONSTS.MODULES_KEY in self.properties and self.algorithm_name in self.properties[CONSTS.MODULES_KEY]:
1554  modules = self.properties[CONSTS.MODULES_KEY][self.algorithm_name]
1555  else:
1556  self.logger.debug(">>> No moduler_key or algorithm_name in self.properties")
1557  modules = []
1558 
1559  self.logger.debug("Algorithm name: <%s>" % (self.algorithm_name))
1560  self.logger.debug("Modules: %s" % modules)
1561 
1562  self.extractors = []
1563  for module in modules:
1564  exrtactor = self.createModule(module)
1565  # Check if module was created successfully and then insert it to extractors
1566  if exrtactor is not None:
1567  self.extractors.append(exrtactor)
1568 
1569  # Info show extractors loaded
1570  self.logger.debug("*******************")
1571  self.logger.debug("Loaded extractors:")
1572  for exrtactor in self.extractors:
1573  self.logger.debug(exrtactor.name)
1574  self.logger.debug("*******************")
1575 
1576  except Exception as err:
1577  ExceptionLog.handler(self.logger, err, MSG_ERROR_LOAD_EXTRACTORS)
1578  raise
1579 
1580 
Here is the call graph for this function:
Here is the caller graph for this function:

◆ loadLogConfigFile()

def dc_processor.Scraper.Scraper.loadLogConfigFile (   self)

Definition at line 1718 of file Scraper.py.

1718  def loadLogConfigFile(self):
1719  try:
1720  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1721  log_conf_file = self.config.get("Application", "log")
1722  logging.config.fileConfig(log_conf_file)
1723  # Logger initialization
1724  self.logger = Utils.MPLogger().getLogger()
1725  except Exception, err:
1726  raise Exception(CONSTS.MSG_ERROR_LOAD_CONFIG + " : " + str(err))
1727 
1728 
Here is the call graph for this function:
Here is the caller graph for this function:

◆ loadOptions()

def dc_processor.Scraper.Scraper.loadOptions (   self)

Definition at line 1732 of file Scraper.py.

1732  def loadOptions(self):
1733  try:
1734  class_name = self.__class__.__name__
1735  self.scraperPropFileName = self.config.get("Application", "property_file_name")
1736  # DBWrapper initialization
1737  dbTaskIniConfigFileName = self.config.get(self.__class__.__name__, "db-task_ini")
1738  config = ConfigParser.ConfigParser()
1739  config.optionxform = str
1740  readOk = config.read(dbTaskIniConfigFileName)
1741  if len(readOk) == 0:
1742  raise Exception(self.MSG_ERROR_WRONG_CONFIG_FILE_NAME + ": " + dbTaskIniConfigFileName)
1743  self.dbWrapper = DBTasksWrapper(config)
1744 
1745  # url sources rules initialization
1746  urlSourcesList = self.config.get(self.__class__.__name__, OPTION_SECTION_URL_SOURCES_RULES)
1747  if isinstance(urlSourcesList, basestring):
1748  self.urlSourcesRules = [urlSourcesRule.strip() for urlSourcesRule in urlSourcesList.split(',')]
1749  self.logger.debug("Initialization urlSourcesRules: %s", varDump(self.urlSourcesRules))
1750 
1751  self.sqliteTimeout = self.config.getint("sqlite", "timeout")
1752 
1753  self.useCurrentYear = self.config.getint("DateTimeType", "useCurrentYear")
1754 
1755  self.tagsTypes = self.config.get(class_name, OPTION_SECTION_TAGS_TYPE)
1756 
1757  if self.config.has_section(OPTION_SECTION_DATETIME_NEWS_NAMES):
1758  self.datetimeNewsNames = []
1759  for item in self.config.items(OPTION_SECTION_DATETIME_NEWS_NAMES):
1760  self.datetimeNewsNames.append(item[0])
1761  else:
1762  self.logger.debug("Config file hasn't section: " + str(OPTION_SECTION_DATETIME_NEWS_NAMES))
1763  self.datetimeNewsNames = TAGS_DATETIME_NEWS_NAMES
1764 
1765  if self.config.has_section(OPTION_SECTION_DATETIME_TEMPLATE_TYPES):
1766  self.datetimeTemplateTypes = []
1767  for item in self.config.items(OPTION_SECTION_DATETIME_TEMPLATE_TYPES):
1768  self.datetimeTemplateTypes.append(item[0])
1769  else:
1770  self.logger.debug("Config file hasn't section: " + str(OPTION_SECTION_DATETIME_TEMPLATE_TYPES))
1771  self.datetimeTemplateTypes = TAGS_DATETIME_TEMPLATE_TYPES
1772  except:
1773  print MSG_ERROR_LOAD_OPTIONS
1774  raise
1775 
1776 

◆ loadScraperProperties()

def dc_processor.Scraper.Scraper.loadScraperProperties (   self)

Definition at line 1779 of file Scraper.py.

1779  def loadScraperProperties(self):
1780  if self.scraperPropFileName is not None:
1781  try:
1782  with open(self.scraperPropFileName, "rb") as fd:
1783  scraperProperies = json.loads(fd.read())
1784  self.properties = scraperProperies[self.__class__.__name__][CONSTS.PROPERTIES_KEY]
1785  except Exception as excp:
1786  self.logger.debug(">>> Some error with scraper property loads = " + str(excp))
1787 
1788 
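The property file is plain JSON keyed first by class name and then by a properties key. A self-contained sketch of that load; the structure written below is illustrative only, not the real property file shipped with the project:

import json
import os
import tempfile

# Write a throw-away property file so the sketch is self-contained.
fd, path = tempfile.mkstemp(suffix='.json')
with os.fdopen(fd, 'w') as f:
    json.dump({'Scraper': {'properties': {'metrics': '{}'}}}, f)

with open(path, 'rb') as f:
    properties = json.loads(f.read())['Scraper']['properties']
print(properties)   # {'metrics': '{}'}
os.unlink(path)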

◆ newsExtraction()

def dc_processor.Scraper.Scraper.newsExtraction (   self)

Definition at line 1366 of file Scraper.py.

1366  def newsExtraction(self):
1367  ret = []
1368 
1369  template = self.getTemplate(explicit=False)
1370 
1371  # get resource as dictionary
1372  resource_set = {}
1373  resource_set["url"] = self.input_data.url
1374  resource_set["resId"] = self.input_data.urlId
1375  resource_set["siteId"] = self.input_data.siteId
1376  resource_set["raw_html"] = self.input_data.raw_content
1377  resource = Resource(resource_set)
1378 
1379  collectResult = Result(self.config, self.input_data.urlId, self.metrics)
1380  blockedByXpathTags = []
1381 
1382  while True:
1383  self.extractor = self.getNextBestExtractor()
1384  self.logger.debug("Got best matching extractor: " + str(self.extractor))
1385  if self.extractor is None:
1386  self.logger.debug("No more extractors, exiting loop")
1387  break
1388 
1389  result = Result(self.config, self.input_data.urlId, self.metrics)
1390 
1391  if CONSTS.TAG_MEDIA in collectResult.tags.keys() and \
1392  not self.extractor.isTagNotFilled(collectResult, CONSTS.TAG_MEDIA):
1393  self.logger.debug("!!! Check collectResult. Tag 'media' already selected. Copy.")
1394  result.tags[CONSTS.TAG_MEDIA] = collectResult.tags[CONSTS.TAG_MEDIA]
1395 
1396  result.blockedByXpathTags = blockedByXpathTags
1397  self.logger.debug(">>> TAG BEGIN extractor = " + str(self.extractor))
1398  result = self.extractor.extractTags(resource, result)
1399 
1400  self.logger.debug(">>> TAG END")
1401  empty_tags = result.getEmptyTags()
1402  self.logger.debug("get list of empty tags from result: " + str(empty_tags))
1403  filled_tags = result.getFilledTags()
1404  self.logger.debug("get list of filled_tags from result: " + str(filled_tags))
1405 
1406  self.commonResultOperations(result)
1407  for tag in result.tags:
1408  if tag in template:
1409  for rule in template[tag]:
1410  self.postprocessing(result, rule, tag)
1411  if tag not in collectResult.tags or not collectResult.isTagFilled(tag):
1412  collectResult.tags[tag] = copy.deepcopy(result.tags[tag])
1413  blockedByXpathTags = result.blockedByXpathTags
1414  result.finish = time.time()
1415  ret.append(result)
1416 
1417  collectResult.blockedByXpathTags = blockedByXpathTags
1418  ret = [collectResult] + ret
1419 
1420  return ret
1421 
1422 
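Across the extractor loop above, the collected result keeps the first non-empty value per tag while every extractor's own result is still appended to the returned list. A stripped-down sketch of that merge, with plain dicts standing in for Result objects and a simple truthiness check standing in for isTagFilled():

def merge_results(per_extractor_results):
    collected = {}
    for result in per_extractor_results:
        for tag, value in result.items():
            # keep the first non-empty value seen for each tag
            if tag not in collected or not collected[tag]:
                collected[tag] = value
    return [collected] + per_extractor_results

merged = merge_results([
    {'title': 'A headline', 'author': ''},
    {'title': 'Other headline', 'author': 'J. Doe'},
])
print(merged[0])   # {'title': 'A headline', 'author': 'J. Doe'}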

◆ normalizeAuthor()

def dc_processor.Scraper.Scraper.normalizeAuthor (   self,
  confProp,
  procProp,
  response 
)

Definition at line 366 of file Scraper.py.

366  def normalizeAuthor(self, confProp, procProp, response):
367  try:
368  if response is not None and response.tags is not None:
369  # self.logger.debug("normalizeAuthor scraper response: " + varDump(response))
370 
371  if self.input_data.template and self.algorithm_name != CONSTS.PROCESS_ALGORITHM_REGULAR:
372  if AuthorType.MAIN_TAG_NAME in response.tags and response.tags[AuthorType.MAIN_TAG_NAME] is not None and \
373  "data" in response.tags[AuthorType.MAIN_TAG_NAME]:
374  inputData = response.tags[AuthorType.MAIN_TAG_NAME]["data"]
375  self.logger.debug("normalizeAuthor response has '" + str(AuthorType.MAIN_TAG_NAME) + "' is: " + \
376  str(inputData))
377  self.logger.debug("normalizeAuthor type of '" + str(AuthorType.MAIN_TAG_NAME) + "' is: " + \
378  str(type(inputData)))
379 
380  inputList = []
381  if isinstance(inputData, str) or isinstance(inputData, unicode):
382  inputList = [inputData]
383  elif isinstance(inputData, list):
384  inputList = inputData
385  else:
386  pass
387 
388  self.logger.debug("normalizeAuthor confProp: " + varDump(confProp))
389  self.logger.debug("normalizeAuthor procProp: " + varDump(procProp))
390 
391  authors = []
392  for inputElem in inputList:
393  author = AuthorType.parse(confProp, procProp, inputElem, self.logger)
394  if author is not None:
395  authors.append(author)
396 
397  self.logger.debug("normalizeAuthor result author: " + str(authors))
398  if len(authors) > 0:
399  response.tags[AuthorType.MAIN_TAG_NAME]["data"] = authors
400 
401  except Exception, err:
402  ExceptionLog.handler(self.logger, err, 'normalizeAuthor error:', (), \
403  {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
404 
405 

◆ normalizeDatetime()

def dc_processor.Scraper.Scraper.normalizeDatetime (   self,
  response,
  algorithmName 
)

Definition at line 411 of file Scraper.py.

411  def normalizeDatetime(self, response, algorithmName):
412  ret = None
413  timezone = ''
414  try:
415  if response is not None and response.tags is not None:
416  # self.logger.debug("normalizeDatetime scraper response: " + varDump(response))
417  tagNames = []
418  if self.input_data.template and algorithmName == CONSTS.PROCESS_ALGORITHM_REGULAR:
 419  # template
420  for responseType in self.datetimeTemplateTypes:
421  for responseTagName in response.tags:
422  self.logger.debug("normalizeDatetime responseTagName: '" + str(responseTagName) + "'")
423  if (response.tags.get(responseTagName) is not None and \
424  'type' in response.tags[responseTagName] and \
425  response.tags[responseTagName]['type'] == responseType) or \
426  (responseTagName == CONSTS.TAG_PUB_DATE and response.tags.get(responseTagName) is not None):
427  tagNames.append(responseTagName)
428  else:
429  # dynamic
430  tagNames = self.datetimeNewsNames
431 
432  self.logger.debug('normalizeDatetime tagNames: ' + varDump(tagNames))
433  retDict = {}
434  for tagName in tagNames:
435  pubdate, tzone = self.extractPubDate(response, tagName)
436  if self.extractor and tagName in response.tags:
437  self.extractor.addTag(result=response, tag_name=tagName + '_normalized', tag_value=pubdate, \
438  xpath=response.tags[tagName]['xpath'])
439 
440  self.logger.debug('tagName: ' + str(tagName) + ' pubdate: ' + str(pubdate))
441  retDict[tagName] = pubdate
442 
443  if tagName == CONSTS.TAG_PUB_DATE:
444  ret = pubdate
445  timezone = tzone
446  else:
447  pass
448 
449  if ret is None:
450  for key, value in retDict.items():
451  if value is not None:
452  ret = value
453  self.logger.debug('set return value from ' + str(key) + ' : ' + str(value))
454  break
455 
456  except Exception, err:
457  ExceptionLog.handler(self.logger, err, 'normalizeDatetime error:', (), \
458  {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
459 
460  return ret, timezone
461 
462 

◆ parseFeed()

def dc_processor.Scraper.Scraper.parseFeed (   self)

Definition at line 1897 of file Scraper.py.

1897  def parseFeed(self):
1898  ret = True
1899  try:
1900  self.entry = json.loads(self.input_data.raw_content)
1901  self.createArticle()
1902  self.putArticleToDB({"default":self.article}) # pylint: disable=E1101
1903  except ValueError, err:
1904  ExceptionLog.handler(self.logger, err, 'Bad raw content:', (self.input_data.raw_content), \
1905  {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
1906  ret = False
1907 
1908  return ret
1909 
1910 

◆ postprocessing()

def dc_processor.Scraper.Scraper.postprocessing (   self,
  result,
  rule,
  tag 
)

Definition at line 899 of file Scraper.py.

899  def postprocessing(self, result, rule, tag):
900  self.logger.debug("!!! rule: '%s'", varDump(rule))
901  if rule.get('postProcessing') is not None and rule["postProcessing"] != "":
902  self.logger.debug("Post-processing applied for tag `%s` with expression: %s",
903  str(tag), str(rule["postProcessing"]))
904  self.applyPostProcessing(result, tag, rule["postProcessing"])
905  else:
906  self.logger.debug("Post-processing is not applied for tag `%s`", str(tag))
907 
908 

◆ prepareResults()

def dc_processor.Scraper.Scraper.prepareResults (   self,
  resultsList 
)

Definition at line 1057 of file Scraper.py.

1057  def prepareResults(self, resultsList):
1058  ret = []
1059  if len(resultsList) > 0:
1060  localElemWeight = 0
1061  firstElemWeight = 0
1062  firstElem = None
1063  tempList = []
1064  for elem in resultsList:
1065  localElemWeight = 0
1066  if elem["join"] == "concat":
1067  tempList.append(elem)
1068  else:
1069  if elem["mandatory"]:
1070  #>>> Mandatory breaking block -------------
1071  if not elem["isExtract"]:
1072  return []
1073  #-------------
1074  localElemWeight = localElemWeight | CONSTS.TAGS_RULES_MASK_MANDATORY_FIELD
1075  if elem["join"] == "best":
1076  localElemWeight = localElemWeight | CONSTS.TAGS_RULES_MASK_RULE_PRIORITY
1077  if elem["isExtract"]:
1078  localElemWeight = localElemWeight | CONSTS.TAGS_RULES_MASK_DEFAULT_VALUE
1079 
1080  self.logger.debug(">>> Rule weight = " + str(localElemWeight))
1081  self.logger.debug(">>> Rule join = " + elem["join"])
1082  if localElemWeight > firstElemWeight:
1083  firstElemWeight = localElemWeight
1084  firstElem = elem
1085 
1086  if firstElem is not None:
1087  tempList = [firstElem] + tempList
1088  isExtractResults = any([elem["isExtract"] for elem in tempList])
1089  if isExtractResults:
1090  ret = [elem for elem in tempList if elem["isExtract"]]
1091  elif len(tempList) > 0:
1092  ret.append(tempList[0])
1093  return ret
1094 
1095 
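The selection above keeps every "concat" rule and, among the remaining rules, the single rule with the highest bit-mask weight built from its mandatory / "best" join / isExtract flags; rules that actually extracted something are then preferred. A simplified standalone sketch (mask values are illustrative, not the real CONSTS masks, and the early return for a failed mandatory rule is omitted):

MASK_MANDATORY = 0x4   # illustrative values
MASK_PRIORITY  = 0x2
MASK_EXTRACTED = 0x1

def rule_weight(rule):
    weight = 0
    if rule['mandatory']:
        weight |= MASK_MANDATORY
    if rule['join'] == 'best':
        weight |= MASK_PRIORITY
    if rule['isExtract']:
        weight |= MASK_EXTRACTED
    return weight

rules = [
    {'name': 'r1', 'join': 'concat', 'mandatory': False, 'isExtract': True},
    {'name': 'r2', 'join': 'best',   'mandatory': False, 'isExtract': True},
    {'name': 'r3', 'join': 'first',  'mandatory': False, 'isExtract': False},
]
concat_rules = [r for r in rules if r['join'] == 'concat']
other_rules = [r for r in rules if r['join'] != 'concat']
best = max(other_rules, key=rule_weight) if other_rules else None
kept = ([best] if best else []) + concat_rules
# prefer rules that actually extracted something, as prepareResults() does
extracted = [r for r in kept if r['isExtract']]
print([r['name'] for r in (extracted or kept[:1])])   # ['r2', 'r1']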

◆ preparseResponse()

def dc_processor.Scraper.Scraper.preparseResponse (   self,
  response 
)

Definition at line 842 of file Scraper.py.

842  def preparseResponse(self, response):
843  for key in response.tags:
844  if "data" in response.tags[key]:
845  if isinstance(response.tags[key]["data"], basestring):
846  localStr = response.tags[key]["data"]
847  response.tags[key]["data"] = []
848  response.tags[key]["data"].append(localStr)
849 
850 

◆ process()

def dc_processor.Scraper.Scraper.process (   self,
  config 
)

Definition at line 599 of file Scraper.py.

599  def process(self, config):
600  # info input data
601  self.logger.info("input_data url: %s, urlId: %s, siteId: %s", str(self.input_data.url), str(self.input_data.urlId),
602  str(self.input_data.siteId))
603 
604  self.baseUrl = self.extractBaseUrlRssFeed(self.input_data.siteId, self.input_data.url)
605  if self.baseUrl is None:
606  self.baseUrl = self.input_data.url
607 
608  if self.input_data.template and self.algorithm_name == CONSTS.PROCESS_ALGORITHM_REGULAR:
609  # Reconfigure processor's properties to involve only template scraper
610  responses = self.templateExtraction(config, self.urlHost)
611  else:
612  # get iterator to ranked list of extractors
613  self.itr = iter(sorted(self.extractors, key=lambda extractor: 0, reverse=True))
614  self.logger.debug("Extractors: %s" % varDump(self.itr))
615  responses = self.newsExtraction()
616 
617  if CONSTS.MEDIA_LIMITS_NAME in self.input_data.batch_item.properties:
618  self.logger.debug("Found property '%s'", str(CONSTS.MEDIA_LIMITS_NAME))
619  self.mediaLimitsHandler = MediaLimitsHandler(self.input_data.batch_item.properties[CONSTS.MEDIA_LIMITS_NAME])
620 
621  for response in responses:
622  response.metricsPrecalculate()
623  response.stripResult()
624  # Add tag 'source_url'
625  self.addCustomTag(result=response, tag_name=CONSTS.TAG_SOURCE_URL, \
626  tag_value=str(self.input_data.url))
627 
628  #self.logger.debug("self.properties: %s", varDump(self.properties))
629  if CONSTS.LANG_PROP_NAME in self.properties:
630  self.logger.debug("!!! Enter '%s' !!!", str(CONSTS.LANG_PROP_NAME))
631 
632  langDetector = ScraperLangDetector(self.properties[CONSTS.LANG_PROP_NAME])
633  langDetector.process(response, self.logger)
634  langTagsDict = langDetector.getLangTags()
635  self.logger.debug("langTagsDict: %s", varDump(langTagsDict))
636 
637 # # self.logger.debug("!!! self.input_data.batch_item.properties = %s, type = %s", varDump(self.input_data.batch_item.properties), str(type(self.input_data.batch_item.properties)))
638 # #
639 # # if 'template' in self.input_data.batch_item.properties and \
640 # # 'templates' in self.input_data.batch_item.properties['template'] and \
641 # # len(self.input_data.batch_item.properties['template']['templates']) > 0 and \
642 # # 'output_format' in self.input_data.batch_item.properties['template']['templates'][0] and \
643 # # 'item' in self.input_data.batch_item.properties['template']['templates'][0]['output_format']:
644 # # itemString = self.input_data.batch_item.properties['template']['templates'][0]['output_format']['item']
645 # # self.logger.debug("itemString: %s:", str(itemString))
646 # # try:
647 # # jsonDict = json.loads(itemString, encoding='utf-8')
648 # # self.logger.debug("jsonDict: %s:", varDump(jsonDict))
649 # # for tagName, langValue in langTagsDict.items():
650 # # jsonDict[tagName] = langValue
651 # #
652 # # self.input_data.batch_item.properties['template']['templates'][0]['output_format']['item'] = \
653 # # json.dumps(jsonDict, ensure_ascii=False, encoding='utf-8')
654 # # except Exception, err:
655 # # self.logger.error(str(err))
656 # # self.logger.info(Utils.getTracebackInfo())
657 
658  # add lang tags to processed content
659  for tagName, langValue in langTagsDict.items():
660  self.addCustomTag(result=response, tag_name=tagName, tag_value=langValue)
661 
662  summaryLang = langDetector.getSummaryLang(response, self.logger)
663  self.addCustomTag(result=response, tag_name=CONSTS.TAG_SUMMARY_LANG, tag_value=summaryLang)
664  self.logger.debug("!!! Leave '%s' !!!", str(CONSTS.LANG_PROP_NAME))
665 
666  # put extracted article to the db
667 
668  if self.algorithm_name != CONSTS.PROCESS_ALGORITHM_REGULAR:
669  self.adjustTitle(response)
670  self.adjustLinkURL(response)
671  self.adjustPartialReferences(response)
672 
673  # self.logger.debug("CONSTS.TAG_PUB_DATE response: " + varDump(response))
674 
675  self.preparseResponse(response)
676 
677  # Improvement author
678  tagsTypes = None
679  if CONSTS.TAGS_TYPES_NAME in self.input_data.batch_item.properties:
680  tagsTypes = self.input_data.batch_item.properties[CONSTS.TAGS_TYPES_NAME]
681 
682  self.logger.info('=' * 50)
683  self.logger.info('self.properties: ' + varDump(self.properties))
684 
685  self.normalizeAuthor(self.tagsTypes, tagsTypes, response)
686 
687  # Setting pubdate in depend of different sources masks
688  # default values
689  pdateSourceMask = APP_CONSTS.PDATE_SOURCES_MASK_BIT_DEFAULT
690  pdateSourceMaskOverwrite = APP_CONSTS.PDATE_SOURCES_MASK_OVERWRITE_DEFAULT
691 
692  # get value 'PDATE_SOURCES_MASK' from site properties
693  if APP_CONSTS.PDATE_SOURCES_MASK_PROP_NAME in self.input_data.batch_item.properties:
694  pdateSourceMask = int(self.input_data.batch_item.properties[APP_CONSTS.PDATE_SOURCES_MASK_PROP_NAME])
695 
696  # get value 'PDATE_SOURCES_MASK_OVERWRITE' from site properties
697  if APP_CONSTS.PDATE_SOURCES_MASK_OVERWRITE_PROP_NAME in self.input_data.batch_item.properties:
698  pdateSourceMaskOverwrite = \
699  int(self.input_data.batch_item.properties[APP_CONSTS.PDATE_SOURCES_MASK_OVERWRITE_PROP_NAME])
700 
701  self.logger.debug('pdateSourceMask = %s, pdateSourceMaskOverwrite = %s',
702  str(pdateSourceMask), str(pdateSourceMaskOverwrite))
703 
704  self.logger.debug("!!! self.input_data.batch_item.urlObj.pDate = " + str(self.input_data.batch_item.urlObj.pDate))
705 
706  timezone = ''
707  # URL object the "pdate" field (supposed was got from the RSS feed)
708  if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_RSS_FEED:
709  if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_RSS_FEED) or \
710  not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_RSS_FEED:
711  self.pubdate, timezone = self.extractPubdateRssFeed(self.input_data.siteId, self.input_data.url)
712 
713  # Normalization procedure after the scraping, supposes the tag dc_date for the NEWS or TEMPLATE scraping.
714  if CONSTS.TAG_DC_DATE in response.tags and pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_DC_DATE:
715  if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_DC_DATE and self.pubdate is None) or \
716  not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_DC_DATE:
717  if CONSTS.TAG_PUB_DATE not in response.tags or \
718  (isinstance(response.tags[CONSTS.TAG_PUB_DATE]["data"], basestring) and \
719  response.tags[CONSTS.TAG_PUB_DATE]["data"].strip() == ""):
720  response.tags[CONSTS.TAG_PUB_DATE] = copy.deepcopy(response.tags[CONSTS.TAG_DC_DATE])
721  response.tags[CONSTS.TAG_PUB_DATE]["name"] = CONSTS.TAG_PUB_DATE
722  if len(response.tags[CONSTS.TAG_PUB_DATE]["data"]) > 0 and response.tags[CONSTS.TAG_PUB_DATE]["data"][0]:
723  self.pubdate = response.tags[CONSTS.TAG_PUB_DATE]["data"][0]
724  self.logger.debug("Pubdate from 'dc_date': " + str(self.pubdate))
725  # Check format
726  d = DateTimeType.parse(self.pubdate, bool(self.useCurrentYear), self.logger, False)
727  self.logger.debug('Check format pubdate: ' + str(d))
728  if d is not None:
729  d, timezone = DateTimeType.split(d)
730  self.pubdate = d.isoformat(DateTimeType.ISO_SEP)
731  self.logger.debug("Result pubdate from 'dc_date': %s, timezone: %s", str(self.pubdate), str(timezone))
732  else:
733  self.pubdate = ''
734 
735  # Normalization procedure after the scraping, supposes the "pubdate" tag for the NEWS or TEMPLATE scraping.
736  if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_PUBDATE:
737  if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_PUBDATE and self.pubdate is None) or \
738  not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_PUBDATE:
739  pubdate, tzone = self.normalizeDatetime(response, self.algorithm_name)
740  if pubdate is not None:
741  self.pubdate = pubdate
742  timezone = tzone
743  self.logger.debug("Pubdate from 'pubdate': " + str(self.pubdate) + " timezone: " + str(timezone))
744 
745  # Current date (SQL NOW())
746  if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_NOW:
747  if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_NOW and self.pubdate is None) or \
748  not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_NOW:
749  self.pubdate = SQLExpression("NOW()") # pylint: disable=R0204
750  self.logger.debug("Pubdate from 'SQL NOW()': " + str(self.pubdate))
751 
752  # Custom SQL expression defined in the property PDATE_SOURCES_EXPRESSION
753  if pdateSourceMask & APP_CONSTS.PDATE_SOURCES_MASK_SQL_EXPRESSION and \
754  APP_CONSTS.PDATE_SOURCES_EXPRESSION_PROP_NAME in self.properties:
755  if (pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_SQL_EXPRESSION and self.pubdate is None) or \
756  not pdateSourceMaskOverwrite & APP_CONSTS.PDATE_SOURCES_MASK_SQL_EXPRESSION:
757  self.pubdate = SQLExpression(str(self.properties[APP_CONSTS.PDATE_SOURCES_EXPRESSION_PROP_NAME]))
758  self.logger.debug("Pubdate from 'sql expression': " + str(self.pubdate))
759 
760  # Apply property 'PDATE_DAY_MONTH_ORDER'
761  self.pubdate = self.pubdateMonthOrder(self.pubdate, self.input_data.batch_item.properties, self.input_data.url)
762 
763  # Apply property 'PDATE_TIME'
764 # self.input_data.batch_item.urlObj.pDate = self.pubdate
765  self.pubdate = FieldsSQLExpressionEvaluator.evaluatePDateTime(self.input_data.batch_item.properties,
766  self.dbWrapper,
767  self.input_data.batch_item.urlObj,
768  self.logger,
769  self.pubdate)
770 
771  # Apply property 'PDATE_TIMEZONES'
772  self.pubdate, timezone = self.pubdateTransform(self.pubdate,
773  timezone,
774  self.input_data.batch_item.properties,
775  self.input_data.url)
776 
777  # Add tag 'pubdate_tz'
778  self.addCustomTag(result=response, tag_name=CONSTS.TAG_PUBDATE_TZ, tag_value=[timezone])
779 
780  self.logger.debug("!!! self.pubdate: %s", str(self.pubdate))
781 # self.logger.debug("!!! response.tags: %s", varDump(response.tags))
782 
783  # apply content of 'pubdate' before formatOutputData
784  self.applyPubdate(response, self.pubdate)
785 
786  # Add tag 'feed_url'
787  feedUrl = self.extractFeedUrlRssFeed(self.input_data.siteId, self.input_data.url)
788  if feedUrl is not None:
789  self.addCustomTag(result=response, tag_name=CONSTS.TAG_FEED_URL, tag_value=[feedUrl])
790 
791  # self.logger.debug("!!! response: %s", varDump(response))
792 
793  if self.outputFormat is None:
794  self.logger.debug(">>> Warning, can't extract output format")
795  else:
796  self.formatOutputData(response, self.outputFormat)
797 
798  response.recalcTagMaskCount(None, self.altTagsMask)
799  self.tagsCount = response.tagsCount
800  self.tagsMask = response.tagsMask
801  # self.putArticleToDB({"default":response})
802  self.logger.debug("self.tagsCount: " + str(self.tagsCount) + " self.tagsMasks: " + str(self.tagsMask))
803 
804  response.finish = time.time()
805  response.data["time"] = "%s" % (response.finish - response.start)
806 
807  response = self.applyHTTPRedirectLink(self.input_data.batch_item.siteId, self.input_data.batch_item.urlObj.url,
808  self.input_data.batch_item.properties, response)
809 
810  self.getProcessedContent(responses)
811 
812 
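The pubdate selection in process() is driven by two bit masks: pdateSourceMask says which sources may contribute a date, and the corresponding bit in pdateSourceMaskOverwrite decides whether that source only fills an empty pubdate (bit set) or always replaces it (bit clear), mirroring the conditions shown above. The bit values and the pick_pubdate() helper below are illustrative, not the real APP_CONSTS masks:

MASK_RSS, MASK_DC_DATE, MASK_PUBDATE, MASK_NOW = 0x1, 0x2, 0x4, 0x8   # illustrative

def pick_pubdate(sources, source_mask, overwrite_mask):
    pubdate = None
    for bit, value in sources:                     # in priority order
        if not source_mask & bit:
            continue                               # source disabled
        if (overwrite_mask & bit and pubdate is None) or not overwrite_mask & bit:
            if value is not None:
                pubdate = value
    return pubdate

sources = [(MASK_RSS, '2018-01-02 10:00:00'),
           (MASK_PUBDATE, '2018-01-03 12:00:00'),
           (MASK_NOW, 'NOW()')]
print(pick_pubdate(sources, MASK_RSS | MASK_PUBDATE, MASK_PUBDATE))
# -> '2018-01-02 10:00:00' (the PUBDATE bit is set in the overwrite mask,
#    so that source only fills in when nothing was found earlier)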

◆ processBatch()

def dc_processor.Scraper.Scraper.processBatch (   self)

Definition at line 1583 of file Scraper.py.

1583  def processBatch(self):
1584  # logger
1585  for entry in self.message_queue:
1586  self.logger.debug(entry)
1587 
1588  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1589  # read pickled batch object from stdin
1590  input_pickled_object = sys.stdin.read()
1591 
1592  try:
1593  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1594  scraper_in_data = pickle.loads(input_pickled_object)
1595  except Exception as err:
1596  ExceptionLog.handler(self.logger, err, 'pickle.loads() error:')
1597  self.logger.debug("input_pickled_object:\n" + str(input_pickled_object))
1598  self.exitCode = EXIT_FAILURE
1599  raise Exception(err)
1600 
1601  try:
1602  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1603  self.input_data = scraper_in_data
1604  if self.input_data.batch_item.urlObj is not None:
1605  urlString = self.input_data.batch_item.urlObj.url
1606  else:
1607  urlString = ""
1608  logMsg = "BatchItem.siteId=" + str(self.input_data.batch_item.siteId) + \
1609  ", BatchItem.urlId=" + str(self.input_data.batch_item.urlId) + \
1610  ", BatchItem.urlObj.url=" + urlString
1611  app.Profiler.messagesList.append(logMsg)
1612  self.logger.info("Incoming data: %s", logMsg)
1613  # self.logger.debug("self.input_data:\n%s", varDump(self.input_data))
1614  self.urlHost = self.calcUrlDomainCrc(self.input_data.url)
1615 
1616  if self.input_data.output_format is not None and "name" in self.input_data.output_format:
1617  self.outputFormat = self.input_data.output_format["name"]
1618 
1619  if self.outputFormat is None and "templates" in self.input_data.batch_item.properties["template"] and \
1620  len(self.input_data.batch_item.properties["template"]["templates"]) > 0 and \
1621  "output_format" in self.input_data.batch_item.properties["template"]["templates"][0] and \
1622  "name" in self.input_data.batch_item.properties["template"]["templates"][0]["output_format"]:
1623  self.outputFormat = self.input_data.batch_item.properties["template"]["templates"][0]["output_format"]["name"]
1624 
1625  if "TAGS_MAPPING" in self.input_data.batch_item.properties and \
1626  self.input_data.batch_item.properties["TAGS_MAPPING"] is not None:
1627  try:
1628  self.altTagsMask = json.loads(self.input_data.batch_item.properties["TAGS_MAPPING"])
1629  self.logger.debug(">>> AltTags = " + str(self.altTagsMask))
1630  except Exception as exp:
1631  self.logger.debug(">>> Bad TAGS_MAPPING properties value, err=" + str(exp))
1632  # check properties in input data
1633  try:
1634  if (self.input_data is not None) and (self.input_data.processor_properties is not None):
1635  processor_properties = self.input_data.processor_properties
1636  # self.logger.debug("Processor's properties was taken from input data: %s" % processor_properties)
1637  # self.logger.debug("Processor's properties type: %s" % str(type(processor_properties)))
1638  if not isinstance(processor_properties, dict):
1639  processor_properties = json.loads(self.input_data.processor_properties)
1640  self.logger.debug("Processor's properties was taken from input data: %s" % processor_properties)
1641  self.properties.update(processor_properties)
1642  except Exception as err:
1643  ExceptionLog.handler(self.logger, err, 'Error load properties from input data:')
1644 
1645  self.algorithm_name = self.properties[CONSTS.ALGORITHM_KEY][CONSTS.ALGORITHM_NAME_KEY]
1646  self.logger.debug("Algorithm : %s" % self.algorithm_name)
1647  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1648  Utils.storePickleOnDisk(input_pickled_object, ENV_SCRAPER_STORE_PATH, "scraper.in." + \
1649  str(self.input_data.urlId))
1650  if "metrics" in self.properties:
1651  try:
1652  self.metrics = json.loads(self.properties["metrics"])
1653  self.logger.debug(">>> Metrics loads = " + str(self.metrics))
1654  except Exception as excp:
 1655  self.logger.debug(">>> Metrics loads exception = " + str(excp))
1656  # TODO main processing over every url from list of urls in the batch object
1657  tmp = sys.stdout
1658  sys.stdout = open("/dev/null", "wb")
1659 
1660  # initialization of scraper
1661  # load scraper's modules
1662  self.loadExtractors()
1663 
1664  self.logger.info("Process with extractor algorithm: " + str(self.algorithm_name))
 1665  # SUPPORT METRICS ALGORITHM
 1666  # if self.algorithm_name == CONSTS.PROCESS_ALGORITHM_METRIC:
 1667  # self.processMetrics()
 1668  # SUPPORT FEED_PARSER ALGORITHM
1669  if self.algorithm_name == CONSTS.PROCESS_ALGORITHM_FEED_PARSER:
1670  self.feedParserProcess()
1671  else:
1672  self.process(self.config)
1673 
1674  # send response to the stdout
1675  sys.stdout = tmp
1676 
1677  scraperResponse = ScraperResponse(self.tagsCount, self.tagsMask, self.pubdate, self.processedContent,
1678  self.errorMask)
1679 # self.logger.debug("scraperResponse:\n%s", varDump(scraperResponse))
1680 
1681  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
1682  output_pickled_object = pickle.dumps(scraperResponse)
1683  Utils.storePickleOnDisk(output_pickled_object, ENV_SCRAPER_STORE_PATH,
1684  "scraper.out." + str(self.input_data.urlId))
1685  print output_pickled_object
1686  sys.stdout.flush()
1687  else:
1688  self.output_data = scraperResponse
1689 
1690  except Exception as err:
1691  ExceptionLog.handler(self.logger, err, 'Scraper process batch error:')
1692  self.exitCode = EXIT_FAILURE
1693  raise Exception('Scraper process batch error:' + str(err))
1694 
1695 
1696 
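In the APP_USAGE_MODEL_PROCESS mode handled above, the scraper receives a pickled batch object on stdin and writes a pickled response back to stdout. A minimal Python 3 round trip over that transport, with plain dicts standing in for the real batch and ScraperResponse objects:

import pickle
import subprocess
import sys

child_code = (
    "import sys, pickle; "
    "data = pickle.loads(sys.stdin.buffer.read()); "
    "sys.stdout.buffer.write(pickle.dumps({'tagsCount': len(data['tags'])}))"
)
proc = subprocess.Popen([sys.executable, '-c', child_code],
                        stdin=subprocess.PIPE, stdout=subprocess.PIPE)
out, _ = proc.communicate(pickle.dumps({'tags': ['title', 'pubdate']}))
print(pickle.loads(out))   # {'tagsCount': 2}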

◆ processingHTMLData()

def dc_processor.Scraper.Scraper.processingHTMLData (   self,
  htmlBuf,
  bufFormat 
)

Definition at line 1331 of file Scraper.py.

1331  def processingHTMLData(self, htmlBuf, bufFormat):
1332  ret = htmlBuf
1333  if bufFormat.find("NO_SCRIPT") >= 0:
1334  ret = Utils.stripHTMLComments(htmlBuf, soup=None)
1335  if bufFormat.find("NO_META") >= 0:
1336  pass
1337  if bufFormat.find("NO_COMMENTS") >= 0:
1338  pass
1339  if bufFormat.find("ENTITIES_ENCODED") >= 0:
1340  pass
1341  return ret
1342 
1343 

◆ pubdateMonthOrder()

def dc_processor.Scraper.Scraper.pubdateMonthOrder (   self,
  rawPubdate,
  properties,
  urlString 
)

Definition at line 2039 of file Scraper.py.

2039  def pubdateMonthOrder(self, rawPubdate, properties, urlString):
2040  # variables for result
2041  pubdate = rawPubdate
2042 
2043  self.logger.debug('pubdateMonthOrder() enter... rawPubdate: ' + str(rawPubdate))
2044  if CONSTS.PDATE_DAY_MONTH_ORDER_NAME in properties and isinstance(rawPubdate, basestring):
2045  propertyObj = []
2046  try:
2047  self.logger.debug('inputted ' + CONSTS.PDATE_DAY_MONTH_ORDER_NAME + ':' + \
2048  str(properties[CONSTS.PDATE_DAY_MONTH_ORDER_NAME]))
2049  propertyObj = json.loads(properties[CONSTS.PDATE_DAY_MONTH_ORDER_NAME])
2050  except Exception, err:
2051  self.logger.error("Fail loads '%s', error: %s", str(CONSTS.PDATE_DAY_MONTH_ORDER_NAME), str(err))
2052 
2053  for propertyElem in propertyObj:
2054  try:
2055  if "pattern" not in propertyElem:
2056  raise Exception('Property "pattern" not found')
2057 
2058  if "order" not in propertyElem:
2059  raise Exception('Property "order" not found')
2060 
2061  pattern = str(propertyElem["pattern"])
2062  order = int(propertyElem["order"])
2063 
2064  if re.search(pattern, urlString, re.UNICODE) is not None:
 2065  self.logger.debug("Pattern '%s' found in url: %s", str(pattern), str(urlString))
2066 
2067  dt = None
2068  if order == 0: # means day follows month
2069  dt = datetime.datetime.strptime(rawPubdate, "%Y-%d-%m %H:%M:%S")
2070  elif order == 1: # means month follows day
2071  dt = datetime.datetime.strptime(rawPubdate, "%Y-%m-%d %H:%M:%S")
2072  else:
2073  raise Exception("Unsupported value of 'order' == " + str(order))
2074 
2075  if dt is not None:
2076  pubdate = dt.strftime("%Y-%d-%m %H:%M:%S")
2077 
2078  except Exception, err:
2079  self.logger.error("Fail execution '%s', error: %s", str(CONSTS.PDATE_DAY_MONTH_ORDER_NAME), str(err))
2080 
2081  self.logger.debug('pubdateMonthOrder() leave... pubdate: ' + str(pubdate))
2082 
2083  return pubdate
2084 
2085 
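A small self-contained check of the day/month-order handling above: the same raw string parses to different dates depending on whether the "%d" or the "%m" field comes first (order == 0 vs order == 1 in the property):

import datetime

raw = '2018-03-04 10:00:00'
day_first   = datetime.datetime.strptime(raw, '%Y-%d-%m %H:%M:%S')  # order == 0
month_first = datetime.datetime.strptime(raw, '%Y-%m-%d %H:%M:%S')  # order == 1
print(day_first)    # 2018-04-03 10:00:00
print(month_first)  # 2018-03-04 10:00:00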

◆ pubdateTransform()

def dc_processor.Scraper.Scraper.pubdateTransform (   self,
  rawPubdate,
  rawTimezone,
  properties,
  urlString 
)

Definition at line 522 of file Scraper.py.

522  def pubdateTransform(self, rawPubdate, rawTimezone, properties, urlString):
523  # variables for result
524  pubdate = rawPubdate
525  timezone = rawTimezone
526 
527  # self.logger.debug('properties: ' + varDump(properties))
528  if CONSTS.PDATE_TIMEZONES_NAME in properties:
529  propertyString = properties[CONSTS.PDATE_TIMEZONES_NAME]
530  self.logger.debug('inputted ' + CONSTS.PDATE_TIMEZONES_NAME + ':' + str(propertyString))
531 
532  dt = DateTimeType.parse(rawPubdate, bool(self.useCurrentYear), self.logger, False)
533  self.logger.debug('pubdate: ' + str(dt))
534  if dt is not None:
535  # get utc offset if necessary
536  utcOffset = DateTimeType.extractUtcOffset(rawTimezone, self.logger)
537  self.logger.debug('utcOffset: ' + str(utcOffset))
538  # transformation accord to PDATE_TIMEZONES properties
539  d = PDateTimezonesHandler.transform(dt, utcOffset, propertyString, urlString, self.logger)
540  if d is not None:
541  dt = d
542 
543  if dt is not None:
544  d, tzone = DateTimeType.split(dt)
545  pubdate = d.isoformat(DateTimeType.ISO_SEP)
546  timezone = tzone
547 
548  return pubdate, timezone
549 
550 

◆ refineBadDateTags()

def dc_processor.Scraper.Scraper.refineBadDateTags (   self,
  response 
)

Definition at line 553 of file Scraper.py.

553  def refineBadDateTags(self, response):
554  removeKeys = []
555  for key in response.tags:
556  if key in DATA_NEWS_TAGS:
557  tagsValue = None
558 
559  if isinstance(response.tags[key], basestring):
560  tagsValue = response.tags[key]
561  elif isinstance(response.tags[key], dict) and "data" in response.tags[key]:
562  if isinstance(response.tags[key]["data"], basestring):
563  tagsValue = response.tags[key]["data"]
564  elif isinstance(response.tags[key]["data"], list) and len(response.tags[key]["data"]) > 0 and \
565  isinstance(response.tags[key]["data"][0], basestring):
566  tagsValue = response.tags[key]["data"][0]
567 
568  if tagsValue is not None:
569  try:
570  dt = parser.parse(tagsValue)
571  int(time.mktime(dt.timetuple()))
572  except Exception:
573  removeKeys.append(key)
574 
575  for key in removeKeys:
576  if key in response.tags:
 577  logging.debug(">>> Remove " + key + " element because it is empty")
578  del response.tags[key]
579 
580 
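The validity test above simply attempts a parse followed by mktime() and drops the tag when either step raises. The same check in isolation; the listing's parser.parse() call is assumed here to be dateutil's parser, so the sketch requires the python-dateutil package:

import time
from dateutil import parser

def is_valid_date(value):
    try:
        dt = parser.parse(value)
        int(time.mktime(dt.timetuple()))
        return True
    except Exception:
        return False

print(is_valid_date('2018-05-14 12:30:00'))  # True
print(is_valid_date('not a date at all'))    # False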

◆ refineCommonText()

def dc_processor.Scraper.Scraper.refineCommonText (   self,
  tagName,
  result 
)

Definition at line 1448 of file Scraper.py.

1448  def refineCommonText(self, tagName, result):
1449  if tagName in result.tags:
1450  if isinstance(result.tags[tagName], dict):
1451  localValue = None
1452  if isinstance(result.tags[tagName]["data"], list) and len(result.tags[tagName]["data"]) > 0:
1453  localValue = result.tags[tagName]["data"][0]
1454  elif isinstance(result.tags[tagName]["data"], basestring):
1455  localValue = result.tags[tagName]["data"]
1456  if localValue is not None:
1457  replaceList = None
1458  if CONSTS.TAG_REDUCE_PROP_NAME in self.properties:
1459  try:
1460  replaceList = json.loads(self.properties[CONSTS.TAG_REDUCE_PROP_NAME])
1461  except Exception:
1462  self.logger.debug(">>> Bad processor_property json format, [" + CONSTS.TAG_REDUCE_PROP_NAME + "]")
1463  if replaceList is None:
1464  replaceList = CONTENT_REPLACEMENT_LIST # json.loads(CONTENT_REPLACEMENT)
1465 
1466  if CONSTS.TAG_REDUCE_MASK_PROP_NAME in self.properties:
1467  try:
1468  self.tagReduceMask = int(self.properties[CONSTS.TAG_REDUCE_MASK_PROP_NAME])
1469  except Exception:
1470  self.logger.error("Bad processor property '%s' value: '%s'", CONSTS.TAG_REDUCE_MASK_PROP_NAME,
1471  str(self.properties[CONSTS.TAG_REDUCE_MASK_PROP_NAME]))
1472 
1473  self.logger.debug("self.tagReduceMask = %s", str(self.tagReduceMask))
1474 # self.logger.debug("replaceList: %s", str(replaceList))
1475 
1476  replaceList = [replaceList[i] for i in xrange(len(replaceList)) if 1 << i & self.tagReduceMask]
1477 
1478 # if " " not in replaceList:
1479 # replaceList.append(" ")
1480 # self.logger.debug(">>> Repl list = " + str(replaceList))
1481  for elem in replaceList:
1482  # self.logger.debug(">>> Value before = " + localValue)
1483  localValue = Utils.replaceLoopValue(localValue, (elem * 2), elem)
1484  # self.logger.debug(">>> Value after = " + localValue)
1485  localValue = localValue.replace("\r", " ")
1486 
1487  if isinstance(result.tags[tagName]["data"], list) and len(result.tags[tagName]["data"]) > 0:
1488  result.tags[tagName]["data"][0] = localValue
1489  elif isinstance(result.tags[tagName]["data"], basestring):
1490  result.tags[tagName]["data"] = localValue
1491 
1492 
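The replacement list above is filtered by a bit mask: entry i survives only if bit i of tagReduceMask is set, and each surviving entry then has doubled occurrences collapsed until the text stops shrinking. A standalone sketch of both steps (the replacement list is illustrative, not the real CONTENT_REPLACEMENT_LIST):

replace_list = ['\n', '\t', ' ']
tag_reduce_mask = 0b101          # keep entries 0 and 2

selected = [replace_list[i] for i in range(len(replace_list)) if (1 << i) & tag_reduce_mask]
print(selected)                  # ['\n', ' ']

text = 'Line one\n\n\nLine   two'
for elem in selected:
    # collapse runs of the element until no doubled occurrence remains
    while (elem * 2) in text:
        text = text.replace(elem * 2, elem)
print(repr(text))                # 'Line one\nLine two'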

◆ replaceLoopValue()

def dc_processor.Scraper.Scraper.replaceLoopValue (   self,
  buf,
  replaceFrom,
  replaceTo 
)

Definition at line 1439 of file Scraper.py.

1439  def replaceLoopValue(self, buf, replaceFrom, replaceTo):
1440  localValue = buf
1441  replaceValue = localValue.replace(replaceFrom, replaceTo)
 1442  while len(replaceValue) != len(localValue):
1443  localValue = replaceValue
1444  replaceValue = localValue.replace(replaceFrom, replaceTo)
1445  return localValue
1446 
1447 

◆ run()

def dc_processor.Scraper.Scraper.run (   self)

Definition at line 174 of file Scraper.py.

174  def run(self):
175  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
176  # call base class run method
177  foundation.CementApp.run(self)
178 
179  # config section
180  self.loadConfig()
181 
182  # load logger config file
183  self.loadLogConfigFile()
184 
185  # options
186  self.loadOptions()
187 
188  # scraper properties
189  self.loadScraperProperties()
190 
191  # Do applied algorithm's job
192  self.processBatch()
193 
194  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
195  # Finish logging
196  self.logger.info(APP_CONSTS.LOGGER_DELIMITER_LINE)
197 
198 

◆ setup()

def dc_processor.Scraper.Scraper.setup (   self)

Definition at line 166 of file Scraper.py.

166  def setup(self):
167  if self.usageModel == APP_CONSTS.APP_USAGE_MODEL_PROCESS:
168  # call base class setup method
169  foundation.CementApp.setup(self)
170 
171 

◆ splitMediaTagString()

def dc_processor.Scraper.Scraper.splitMediaTagString (   self,
  urlStringMedia 
)

Definition at line 2132 of file Scraper.py.

2132  def splitMediaTagString(self, urlStringMedia):
2133  # variable for result
2134  urls = []
2135  PROTOCOL_STR = 'http'
2136  DELIMITER_OLD = ','
2137  DELIMITER_NEW = '|||||'
2138  urlStringMedia = urlStringMedia.replace(DELIMITER_OLD + PROTOCOL_STR, DELIMITER_NEW + PROTOCOL_STR)
2139  # temporary string for replace in url string
2140  REPLACE_STR = 'base64|'
2141  if urlStringMedia.find(MediaLimitsHandler.BINARY_IMAGE_SEARCH_STR) > -1:
2142  urlStringMedia = urlStringMedia.replace(MediaLimitsHandler.BINARY_IMAGE_SEARCH_STR, REPLACE_STR)
2143  urls = urlStringMedia.split(DELIMITER_NEW)
2144  self.logger.debug("!!! urls before: " + varDump(urls))
2145  urls = [url.replace(REPLACE_STR, MediaLimitsHandler.BINARY_IMAGE_SEARCH_STR) for url in urls]
2146  self.logger.debug("!!! urls after: " + varDump(urls))
2147  else:
2148  urls = urlStringMedia.split(DELIMITER_NEW)
2149 
2150  return urls
2151 
2152 
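The split above protects against commas inside a single URL (for example in a query string): only ",http" boundaries are rewritten to a sentinel delimiter before splitting, so ordinary commas survive inside each URL. A standalone sketch of that trick with made-up URLs:

media = ('http://example.com/a.jpg?size=640,480,'
         'http://example.com/b.jpg')
sentinel = '|||||'
prepared = media.replace(',http', sentinel + 'http')
urls = prepared.split(sentinel)
print(urls)
# ['http://example.com/a.jpg?size=640,480', 'http://example.com/b.jpg']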

◆ templateExtraction()

def dc_processor.Scraper.Scraper.templateExtraction (   self,
  config,
  urlHost 
)

Definition at line 914 of file Scraper.py.

914  def templateExtraction(self, config, urlHost):
915  resultsList = []
916 
917  self.extractor = ScrapyExtractor(config, self.input_data.template, urlHost)
918  result = Result(None, self.input_data.urlId, self.metrics)
919  sel = SelectorWrapper(text=self.input_data.raw_content)
920  template = self.getTemplate()
921  for tag in template:
922  self.logger.debug("Template tag: " + tag)
923  if "state" in template[tag] and not bool(int(template[tag]["state"])):
924  self.logger.debug("Tag skipped because state disabled, name: %s", str(tag))
925  continue
926  xPathPreparing = TemplateExtractorXPathPreparing(self.properties[CONSTS.TAG_MARKUP_PROP_NAME] \
927  if CONSTS.TAG_MARKUP_PROP_NAME in self.properties else None)
928  for rule in template[tag]:
929  if not isinstance(rule, dict):
930  self.logger.error("Rule skipped because wrong structure - is not dict() type: %s", str(type(rule)))
931  continue
932  if "attributesExclude" in rule:
933  try:
934  if rule["attributesExclude"] != "":
935  self.attrConditions = json.loads(rule["attributesExclude"])
936  except Exception as err:
937  self.logger.error("Feature of attributesExclude ignored because wrong structure: %s", str(err))
938  self.attrConditions = None
939  else:
940  self.attrConditions = None
941  xPathPreparing.attrConditions = self.attrConditions
942  pathDict = Utils.getPairsDicts(rule)
943  isExtract = True
944  localResult = Result(None, self.input_data.urlId, self.metrics)
945  # Added new template format conversion
946  xpath = None
947  xpathValue = None
948  self.logger.debug(">>> self.properties: " + varDump(self.properties))
949  # Added new template type specification
950  self.xpathSplitString = xPathPreparing.resolveDelimiter(rule, self.properties, self.xpathSplitString)
951  innerDelimiter = xPathPreparing.resolveInnerDelimiter(rule, self.properties)
952  self.logger.debug(">>> xpathSplitString: '" + str(self.xpathSplitString) + "'")
953  self.logger.debug(">>> innerDelimiter: '" + str(innerDelimiter) + "'")
954  try:
955  xpath, xpathValue = xPathPreparing.process(rule, sel, self.xpathSplitString, innerDelimiter)
956  except Exception as excp:
957  ExceptionLog.handler(self.logger, excp, "Rule/xpath exception: ", (), \
958  {ExceptionLog.LEVEL_NAME_ERROR:ExceptionLog.LEVEL_VALUE_DEBUG})
959  continue
960  self.logger.debug("xpath: `%s`, xpathType: `%s`, xpathValue: `%s`",
961  str(xpath), str(type(xpathValue)), str(xpathValue))
962  if (isinstance(xpathValue, list) and len(xpathValue) == 0) or\
963  (isinstance(xpathValue, basestring) and xpathValue == ''):
964  self.logger.debug(">>> set default xpathValue")
965  xpathValue = []
966  xpathValue.append(rule["default"])
967  isExtract = False
968  self.logger.debug("result before:\n%s", varDump(localResult))
969  self.extractor.addTag(localResult, tag, xpathValue, xpath, not isExtract, False, rule["type"])
970  self.logger.debug("result after:\n%s", varDump(localResult))
971 
972  self.logger.debug("Tag type: `%s`, tags data type: `%s`",
973  str(type(localResult.tags)), str(type(localResult.tags[tag]["data"])))
974  if tag in localResult.tags and isinstance(localResult.tags[tag]["data"], basestring):
975  self.logger.debug("Convert result for tag: `%s`", str(tag))
976  localString = localResult.tags[tag]["data"]
977  localResult.tags[tag]["data"] = []
978  localResult.tags[tag]["data"].append(localString)
979 
980  self.formatTag(localResult, rule, tag, pathDict, isExtract)
981 
982  if isExtract:
983  self.postprocessing(localResult, rule, tag)
984 
985  localResult.finish = time.time()
986 
987  resultsList.append({"obj": localResult, "join": rule["join"], "isExtract": isExtract, "mandatory":
988  (bool(rule["mandatory"]) if "mandatory" in rule else False),
989  "delimiter": (rule["delimiter"] if "delimiter" in rule else self.xpathSplitString),
990  "type": rule["type"]})
991 
992  prepareResultsList = self.prepareResults(resultsList)
993  self.compileResults(result, prepareResultsList, tag, xPathPreparing)
994  resultsList = []
995  result.finish = time.time()
996 
997  return [result]
998 
999 
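Each rule's XPath is evaluated against the raw page and the rule's "default" value is substituted when nothing matches, with isExtract dropping to False in that case. A small sketch of that pattern using lxml directly; lxml here merely stands in for the project's SelectorWrapper, and the page and rule are made up for illustration:

from lxml import html

page = html.fromstring('<html><body><h1>A headline</h1></body></html>')
rule = {'xpath': '//h1/text()', 'default': 'N/A'}

xpath_value = page.xpath(rule['xpath'])
is_extract = bool(xpath_value)
if not is_extract:
    xpath_value = [rule['default']]   # fall back to the rule's default value
print(xpath_value)    # ['A headline']
print(is_extract)     # True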

Member Data Documentation

◆ algorithm_name

dc_processor.Scraper.Scraper.algorithm_name

Definition at line 135 of file Scraper.py.

◆ altTagsMask

dc_processor.Scraper.Scraper.altTagsMask

Definition at line 143 of file Scraper.py.

◆ article

dc_processor.Scraper.Scraper.article

Definition at line 139 of file Scraper.py.

◆ attrConditions

dc_processor.Scraper.Scraper.attrConditions

Definition at line 156 of file Scraper.py.

◆ baseUrl

dc_processor.Scraper.Scraper.baseUrl

Definition at line 161 of file Scraper.py.

◆ config

dc_processor.Scraper.Scraper.config

Definition at line 1701 of file Scraper.py.

◆ configFile

dc_processor.Scraper.Scraper.configFile

Definition at line 148 of file Scraper.py.

◆ datetimeNewsNames

dc_processor.Scraper.Scraper.datetimeNewsNames

Definition at line 153 of file Scraper.py.

◆ datetimeTemplateTypes

dc_processor.Scraper.Scraper.datetimeTemplateTypes

Definition at line 154 of file Scraper.py.

◆ dbWrapper

dc_processor.Scraper.Scraper.dbWrapper

Definition at line 157 of file Scraper.py.

◆ entry

dc_processor.Scraper.Scraper.entry

Definition at line 138 of file Scraper.py.

◆ errorMask

dc_processor.Scraper.Scraper.errorMask

Definition at line 141 of file Scraper.py.

◆ exitCode

dc_processor.Scraper.Scraper.exitCode

Definition at line 126 of file Scraper.py.

◆ extractor

dc_processor.Scraper.Scraper.extractor

Definition at line 128 of file Scraper.py.

◆ extractors

dc_processor.Scraper.Scraper.extractors

Definition at line 129 of file Scraper.py.

◆ input_data

dc_processor.Scraper.Scraper.input_data

Definition at line 130 of file Scraper.py.

◆ itr

dc_processor.Scraper.Scraper.itr

Definition at line 127 of file Scraper.py.

◆ logger

dc_processor.Scraper.Scraper.logger

Definition at line 131 of file Scraper.py.

◆ mediaLimitsHandler

dc_processor.Scraper.Scraper.mediaLimitsHandler

Definition at line 158 of file Scraper.py.

◆ message_queue

dc_processor.Scraper.Scraper.message_queue

Definition at line 137 of file Scraper.py.

◆ metrics

dc_processor.Scraper.Scraper.metrics

Definition at line 142 of file Scraper.py.

◆ MSG_ERROR_WRONG_CONFIG_FILE_NAME

string dc_processor.Scraper.Scraper.MSG_ERROR_WRONG_CONFIG_FILE_NAME = "Config file name is wrong"
static

Definition at line 108 of file Scraper.py.

◆ output_data

dc_processor.Scraper.Scraper.output_data

Definition at line 149 of file Scraper.py.

◆ outputFormat

dc_processor.Scraper.Scraper.outputFormat

Definition at line 140 of file Scraper.py.

◆ processedContent

dc_processor.Scraper.Scraper.processedContent

Definition at line 146 of file Scraper.py.

◆ properties

dc_processor.Scraper.Scraper.properties

Definition at line 134 of file Scraper.py.

◆ pubdate

dc_processor.Scraper.Scraper.pubdate

response.tagsLangDetecting(self.properties[CONSTS.LANG_PROP_NAME])

Definition at line 136 of file Scraper.py.

◆ scraperPropFileName

dc_processor.Scraper.Scraper.scraperPropFileName

Definition at line 133 of file Scraper.py.

◆ sqliteTimeout

dc_processor.Scraper.Scraper.sqliteTimeout

Definition at line 132 of file Scraper.py.

◆ tagReduceMask

dc_processor.Scraper.Scraper.tagReduceMask

Definition at line 160 of file Scraper.py.

◆ tagsCount

dc_processor.Scraper.Scraper.tagsCount

Definition at line 144 of file Scraper.py.

◆ tagsMask

dc_processor.Scraper.Scraper.tagsMask

Definition at line 145 of file Scraper.py.

◆ tagsTypes

dc_processor.Scraper.Scraper.tagsTypes

Definition at line 155 of file Scraper.py.

◆ urlHost

dc_processor.Scraper.Scraper.urlHost

Definition at line 150 of file Scraper.py.

◆ urlSourcesRules

dc_processor.Scraper.Scraper.urlSourcesRules

Definition at line 159 of file Scraper.py.

◆ usageModel

dc_processor.Scraper.Scraper.usageModel

Definition at line 147 of file Scraper.py.

◆ useCurrentYear

dc_processor.Scraper.Scraper.useCurrentYear

Definition at line 152 of file Scraper.py.

◆ WWW_PREFIX

string dc_processor.Scraper.Scraper.WWW_PREFIX = "www."
static

Definition at line 110 of file Scraper.py.

◆ xpathSplitString

dc_processor.Scraper.Scraper.xpathSplitString

Definition at line 151 of file Scraper.py.


The documentation for this class was generated from the following file: