HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
CollectURLs.py
1 """
2 @package: dc
3 @file CollectURLs.py
4 @author Scorp <developers.hce@gmail.com>
5 @link: http://hierarchical-cluster-engine.com/
6 @copyright: Copyright © 2013-2014 IOIX Ukraine
7 @license: http://hierarchical-cluster-engine.com/license/
8 @since: 0.1
9 """
10 
11 import copy
12 import json
13 import hashlib
14 import re
15 import lxml.html
16 try:
17  import feedparser
18 except ImportError:
19  feedparser = None
20 
21 import app.Consts as APP_CONSTS
22 from app.Filters import Filters
23 # from app.Utils import UrlNormalizator
24 from app.Utils import urlNormalization
25 from app.UrlNormalize import UrlNormalize
26 from app.Utils import varDump
27 import app.Utils as Utils # pylint: disable=F0401
28 import dc_processor.Constants as PCONSTS
29 import dc.EventObjects
30 from dc_crawler.Fetcher import SeleniumFetcher
31 import dc_crawler.Constants as CRAWLER_CONSTS
32 from dc_crawler.HTTPRedirectResolver import HTTPRedirectResolver
33 
34 
35 logger = Utils.MPLogger().getLogger()
36 
37 
38 class CollectURLs(object):
39 
40  BINARY_CONTENT_TYPE_PATTERN = re.compile('(text)|(xml)', re.I)
41  COLLECT_POST_DATA_NAME = "COLLECT_POST_DATA"
42  COLLECT_POST_DATA = "1"
43  DC_URLS_TABLE_PREFIX = "urls_"
44  PATTERN_WITH_PROTOCOL = re.compile('[a-zA-Z]+:(//)?')
45  DETECT_MIME_MAIN_CONTENT = "1"
46  DETECT_MIME_COLLECTED_URL = "2"
47  DETECT_MIME_TIMEOUT = 1
48 
49 
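For orientation, the site properties consulted by this class are plain string values keyed by name. A hedged overview of the keys referenced in this file, with illustrative values only (not an exhaustive or authoritative list):

siteProperties = {
  "ROBOTS_COLLECT": "1",                 # >0: obey robots.txt for collected URLs
  "RSS_FEED_ZERO_ITEM": "1",             # 1: also process the feed root as an item
  "RSS_FEEDPARSER_MODE": "0",            # >0: native feedparser mode, else modified mode
  "CONTENT_TYPE_MAP": None,              # optional map used by detectUrlMime()
  "URL_CHAIN": '{"url_pattern": "..."}', # JSON, enables chain URL objects
  "HTTP_REDIRECT_RESOLVER": "",          # per-project redirect resolving properties
  "CONNECTION_TIMEOUT": "30",            # seconds, falls back to CRAWLER_CONSTS value
  "HTTP_REDIRECTS_MAX": "5",             # maximum number of redirects to follow
  "HTTP_AUTH_NAME": "user",              # together with HTTP_AUTH_PWD enables basic auth
  "HTTP_AUTH_PWD": "secret",
  "COLLECT_POST_DATA": "1",              # 1: extractFormURL() collects form data
}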
50  def __init__(self, isAbortedByTTL=None):
51  self.crawledResource = None
52  self.url = None
53  self.dom = None
54  self.realUrl = None
55  self.baseUrl = None
56  self.processorName = None
57  self.batchItem = None
58  self.urlXpathList = None
59  self.feedItems = None
60  self.feed = None
61  self.siteProperties = None
62  self.site = None
63  self.dbWrapper = None
64  self.autoRemoveProps = None
65  self.autoDetectMime = None
66  self.processContentTypes = None
67  self.postForms = None
68  self.urlProcess = None
69  self.robotsParser = None
70  self.urlsXpathList = []
71  self.isAbortedByTTL = (lambda: False) if isAbortedByTTL is None else isAbortedByTTL
72 
73 
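The isAbortedByTTL argument is an optional zero-argument callable; when it is omitted the constructor substitutes a lambda that always returns False, so the collection loop never aborts early. A minimal wiring sketch, assuming only this constructor signature (the 30-second deadline is illustrative):

import time

deadline = time.time() + 30  # hypothetical processing TTL
collector = CollectURLs(isAbortedByTTL=lambda: time.time() > deadline)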
74  # #checkFieldsIsNone method checks that all of the class's mandatory fields are initialized
75  #
76  def checkFieldsIsNone(self):
77  excludeList = ['feedItems', 'feed', 'processorName', 'autoDetectMime', 'processContentTypes',
78  'postForms', 'robotsParser', 'dom', 'dbWrapper']
79  for field in self.__dict__:
80  if field not in excludeList and (not hasattr(self, field) or getattr(self, field) is None):
81  msg = "Mandatory field must be initialized, field Name = " + field
82  logger.error(msg)
83  raise Exception(msg)
84 
85 
86  # # process method
87  #
88  # @param httpCode - HTTP response code of the crawled resource
89  # @param readOnly - boolean read-only flag
90  # @param httpApplyHeaders - HTTP headers to apply
91  # @param proxyName - proxy name
92  def process(self, httpCode, readOnly=False, httpApplyHeaders=None, proxyName=None):
93 
94  self.checkFieldsIsNone()
95  if self.siteProperties is None:
96  self.siteProperties = {}
97  if self.processContentTypes is None:
98  self.processContentTypes = []
99  localSiteId = self.batchItem.siteId if self.batchItem.siteId else "0"
100  nextStep = True
101  useChains = False
102  internalLinks, externalLinks = [], []
103  maxURLsFromPage = 0
104  params = []
105  chainUrls = []
106  formUrls = None
107  formMethods = None
108  formFields = None
109  urlSet = set()
110 
111  logger.debug("!!! self.site.maxURLsFromPage = " + str(self.site.maxURLsFromPage))
112  logger.debug("!!! self.url.maxURLsFromPage = " + str(self.url.maxURLsFromPage))
113 
114  if self.site is not None and self.site.maxURLsFromPage is not None:
115  maxURLsFromPage = self.site.maxURLsFromPage
116 
117  if self.url is not None and self.url.maxURLsFromPage is not None and self.url.maxURLsFromPage > 0:
118  maxURLsFromPage = self.url.maxURLsFromPage
119 
120  if nextStep and self.crawledResource is not None and \
121  not self.BINARY_CONTENT_TYPE_PATTERN.search(self.crawledResource.content_type):
122  nextStep = False
123 
124  # don't parse url for 4XX or 5XX response
125  if nextStep and self.crawledResource is not None:
126  code_type = int(self.crawledResource.http_code) / 100
127  if code_type == 4 or code_type == 5:
128  nextStep = False
129 
130  if nextStep and self.crawledResource is not None and not self.crawledResource.html_content:
131  nextStep = False
132 
133  # if nextStep and self.dom is None:
134  # logger.debug("DOM is None")
135  # nextStep = False
136 
137  if nextStep:
138  useChains = True
139  if self.dom is not None:
140  self.processProcessor(urlSet, self.dom, self.urlXpathList, self.batchItem.urlObj)
141  formUrls, formMethods, formFields = self.extractFormURL(self.dom, self.siteProperties)
142  urlSet.update(formUrls)
143  else:
144  logger.debug("DOM is None")
145 
146  if self.url.type == dc.EventObjects.URL.TYPE_SINGLE:
147  logger.debug("URL type: single")
148  nextStep = False
149 
150  if nextStep and self.crawledResource.dynamic_fetcher_result_type == \
151  SeleniumFetcher.MACRO_RESULT_TYPE_URLS_LIST:
152  ul = None
153  try:
154  ul = json.loads(self.crawledResource.html_content)
155  except Exception as err:
156  logger.error("Error deserialize macro data from result string: %s\n%s", str(err),
157  self.crawledResource.html_content)
158  if ul is not None:
159  logger.debug("Fill urlSet from macro results: %s items", str(len(ul)))
160  if isinstance(ul, list):
161  urlSet.update([u for u in ul if isinstance(u, basestring) and u != ''])
162 
163  if nextStep:
164  if self.url.type == dc.EventObjects.URL.TYPE_CHAIN:
165  logger.debug("URL type: chain")
166  nextStep = False
167 
168  if nextStep:
169  # (3) END
170  urlTable = self.DC_URLS_TABLE_PREFIX + localSiteId
171  self.urlProcess.urlTable = urlTable
172  try:
173  if self.siteProperties is not None and "RSS_FEED_ZERO_ITEM" in self.siteProperties and \
174  int(self.siteProperties["RSS_FEED_ZERO_ITEM"]) == 1:
175  if self.processorName == PCONSTS.PROCESSOR_FEED_PARSER or self.processorName == PCONSTS.PROCESSOR_RSS:
176  self.feedElementsProcessing(self.url.urlMd5, httpCode, self.url.url, localSiteId, self.url, self.url.url,
177  params, maxURLsFromPage, True)
178  except ValueError:
179  logger.debug(">>> Wrong \"RSS_FEED_ZERO_ITEM\" property's value")
180 
181  logger.debug("URLs candidates collected %s items:\n%s", str(len(urlSet)), str(urlSet))
182 
183  if self.site.maxURLs > 0 and len(urlSet) >= self.site.maxURLs:
184  urlSet = set(list(urlSet)[:self.site.maxURLs])
185  logger.debug("Site maxURLs = %s limit reached.", str(self.site.maxURLs))
186 
187  if self.site.maxResources > 0 and len(urlSet) >= self.site.maxResources:
188  urlSet = set(list(urlSet)[:self.site.maxResources])
189  logger.debug("Site maxResources = %s limit reached.", str(self.site.maxResources))
190 
191  countCnt = 0
192  countErrors = 0
193  for elemUrl in urlSet:
194 
195  if self.isAbortedByTTL():
196  logger.debug("Aborted by TTL. All elements skipped.")
197  break
198 
199  if elemUrl is None:
200  logger.debug("Some url from urlSet is None, skipped.")
201  continue
202 
203  elemUrl = elemUrl.strip()
204  if elemUrl == '':
205  logger.debug("Some url from urlSet is empty, skipped!")
206  continue
207 
208  localUrl = elemUrl
209  self.urlProcess.urlObj = self.url
210  self.urlProcess.url = elemUrl
211  self.urlProcess.dbWrapper = self.dbWrapper
212  self.urlProcess.siteId = localSiteId
213  retUrl, retContinue = self.urlProcess.processURL(self.realUrl, internalLinks, externalLinks, self.filtersApply,
214  None, self.baseUrl)
215  if retUrl is not None:
216  elemUrl = retUrl
217  elemUrl = UrlNormalize.execute(siteProperties=self.siteProperties, base=self.baseUrl, url=elemUrl, supportProtocols=None, log=logger)
218  else:
219  retContinue = True
220 
221  if retContinue:
222  logger.debug("Candidate URL is not passed general checks, skipped: %s", str(elemUrl))
223  continue
224 
225  # Apply filter to Url
226  if not self.filtersApply(inputFilters=self.site.filters,
227  subject=elemUrl,
228  depth=self.url.depth,
229  wrapper=self.dbWrapper,
230  siteId=localSiteId):
231  logger.debug("Candidate URL not matched filters, skipped.")
232  continue
233  else:
234  logger.debug("Candidate URL matched filters.")
235 
236  # Check exist of the Url
237  urlMd5 = hashlib.md5(elemUrl).hexdigest()
238  self.urlProcess.url = elemUrl
239  self.urlProcess.siteId = localSiteId
240  self.urlProcess.urlTable = urlTable
241  if self.urlProcess.isUrlExist(self.site.recrawlPeriod, urlMd5):
242  logger.debug("Candidate URL %s already exist, skipped.", str(urlMd5))
243  continue
244 
245  if self.site.maxURLs > 0:
246  if httpCode == CRAWLER_CONSTS.HTTP_CODE_200:
247  countCnt += 1
248  else:
249  countErrors += 1
250 
251  if self.dbWrapper is not None:
252  currentCnt = self.urlProcess.readCurrentCnt(self.site.maxURLs)
253  if currentCnt >= self.site.maxURLs or countCnt >= self.site.maxURLs or \
254  (countCnt + countErrors) >= self.site.maxURLs:
255  logger.debug("Site MaxURLs: %s limit is reached. countCnt = %s, currentCnt = %s",
256  str(self.site.maxURLs), str(countCnt), str(currentCnt))
257  autoremovedURLs = self.urlProcess.autoRemoveURL(self.autoRemoveProps, self.site.recrawlPeriod, urlTable,
258  self.dbWrapper)
259  if autoremovedURLs == 0:
260  logger.debug("No one URL auto removed, candidate URL skipped!")
261  continue
262  else:
263  logger.debug("%s URLs auto removed.", str(autoremovedURLs))
264 
265  if currentCnt >= self.site.maxResources or countCnt >= self.site.maxResources or \
266  (countCnt + countErrors) >= self.site.maxResources:
267  logger.debug("Site maxResources = %s limit is reached. countCnt = %s, currentCnt = %s",
268  str(self.site.maxResources), str(countCnt), str(currentCnt))
269  autoremovedURLs = self.urlProcess.autoRemoveURL(self.autoRemoveProps, self.site.recrawlPeriod, urlTable,
270  self.dbWrapper)
271  if autoremovedURLs == 0:
272  logger.debug("No one URL auto removed, candidate URL skipped!")
273  continue
274  else:
275  logger.debug("%s URLs auto removed.", str(autoremovedURLs))
276 
277  # detect collected url mime type and ignore non-match URL
278  # (7) Detect collected url mime type and ignore non-match URL
279  detectedMime = ''
280  if self.autoDetectMime == self.DETECT_MIME_COLLECTED_URL and self.processContentTypes is not None:
281  self.urlProcess.url = elemUrl
282  detectedMime = self.urlProcess.detectUrlMime(self.siteProperties["CONTENT_TYPE_MAP"] if \
283  "CONTENT_TYPE_MAP" in self.siteProperties else None)
284  if detectedMime not in self.processContentTypes:
285  logger.debug("Candidate URL MIME type is not matched, skipped!")
286  continue
287  # (7) END
288 
289  if "ROBOTS_COLLECT" not in self.siteProperties or int(self.siteProperties["ROBOTS_COLLECT"]) > 0:
290  logger.debug("Robots.txt obey mode is ON")
291  if self.robotsParser and self.robotsParser.loadRobots(elemUrl, self.batchItem.siteId, httpApplyHeaders,
292  proxyName):
293  isAllowed, retUserAgent = self.robotsParser.checkUrlByRobots(elemUrl, self.batchItem.siteId,
294  httpApplyHeaders)
295  if not isAllowed:
296  logger.debug("URL " + elemUrl + " is NOT Allowed by user-agent:" + str(retUserAgent))
297  self.urlProcess.updateURLForFailed(APP_CONSTS.ERROR_ROBOTS_NOT_ALLOW, self.batchItem)
298  continue
299 
300  self.urlProcess.siteId = localSiteId
301  depth = self.urlProcess.getDepthFromUrl(self.batchItem.urlId)
302 
303  # per-project redirect resolving
304  if "HTTP_REDIRECT_RESOLVER" in self.siteProperties and self.siteProperties["HTTP_REDIRECT_RESOLVER"] != "":
305  logger.debug('!!!!!! HTTP_REDIRECT_RESOLVER !!!!! ')
306 
307 
308  if "CONNECTION_TIMEOUT" in self.siteProperties:
309  connectionTimeout = float(self.siteProperties["CONNECTION_TIMEOUT"])
310  else:
311  connectionTimeout = CRAWLER_CONSTS.CONNECTION_TIMEOUT
312 
313  tm = int(self.url.httpTimeout) / 1000.0
314  if isinstance(self.url.httpTimeout, float):
315  tm += float('0' + str(self.url.httpTimeout).strip()[str(self.url.httpTimeout).strip().find('.'):])
316 
317  proxies = {"http": "http://" + proxyName} if proxyName is not None else None
318 
319  auth = None
320  if 'HTTP_AUTH_NAME' in self.siteProperties and 'HTTP_AUTH_PWD' in self.siteProperties:
321  authName = self.siteProperties['HTTP_AUTH_NAME']
322  authPwd = self.siteProperties['HTTP_AUTH_PWD']
323  if authName is not None and authPwd is not None:
324  auth = (authName, authPwd)
325 
326  postForms = {}
327  for key in self.siteProperties.keys():
328  if key.startswith('HTTP_POST_FORM_'):
329  postForms[key[len('HTTP_POST_FORM_'):]] = self.siteProperties[key]
330  postData = self.urlProcess.resolveHTTP(postForms, httpApplyHeaders)
331 
332  maxRedirects = HTTPRedirectResolver.RedirectProperty.DEFAULT_VALUE_MAX_REDIRECTS
333  if 'HTTP_REDIRECTS_MAX' in self.siteProperties:
334  maxRedirects = int(self.siteProperties['HTTP_REDIRECTS_MAX'])
335 
336  redirectResolver = HTTPRedirectResolver(propertyString=self.siteProperties["HTTP_REDIRECT_RESOLVER"],
337  fetchType=self.site.fetchType,
338  dbWrapper=self.dbWrapper,
339  siteId=localSiteId,
340  connectionTimeout=connectionTimeout)
341 
342  resUrl = redirectResolver.resolveRedirectUrl(url=elemUrl,
343  headers=httpApplyHeaders,
344  timeout=tm,
345  allowRedirects=True,
346  proxies=proxies,
347  auth=auth,
348  postData=postData,
349  maxRedirects=maxRedirects,
350  filters=self.site.filters)
351 
352  logger.debug("Resolved url: %s", str(resUrl))
353  elemUrl = resUrl
354 
355  if elemUrl is not None:
356  self.urlProcess.url = elemUrl
357  self.urlProcess.siteId = localSiteId
358  self.urlProcess.urlObj = self.url
359  localUrlObj = self.urlProcess.createUrlObjForCollectURLs(urlMd5, formMethods, self.batchItem.urlId, depth,
360  detectedMime, self.site.maxURLsFromPage)
361  # update counters of external and internal links
362  localUrlObj.linksI = len(internalLinks)
363  localUrlObj.linksE = len(externalLinks)
364 
365  if self.processorName == PCONSTS.PROCESSOR_FEED_PARSER or self.processorName == PCONSTS.PROCESSOR_RSS:
366  self.feedElementsProcessing(urlMd5, httpCode, elemUrl, localSiteId, localUrlObj, localUrl, params,
367  maxURLsFromPage)
368  else:
369  params.append(localUrlObj)
370 
371  if useChains and "URL_CHAIN" in self.siteProperties and self.siteProperties["URL_CHAIN"] is not None:
372  localChainDict = json.loads(self.siteProperties["URL_CHAIN"])
373  depth = self.urlProcess.getDepthFromUrl(self.batchItem.urlId)
374  if "url_pattern" in localChainDict:
375  for elemUrl in urlSet:
376  if elemUrl is None:
377  logger.debug("Some url from urlSet is None")
378  continue
379  self.urlProcess.url = elemUrl
380 # retUrl = self.urlProcess.simpleURLCanonize(self.realUrl)
381 # if retUrl is None or not UrlNormalizator.isNormalUrl(retUrl):
382 
383  retUrl = urlNormalization(self.baseUrl, elemUrl)
384  if retUrl is None:
385  logger.debug("Bad url normalization, url: %s", retUrl)
386  continue
387  else:
388  elemUrl = retUrl
389  detectedMime = ''
390  if self.autoDetectMime == self.DETECT_MIME_COLLECTED_URL and self.processContentTypes is not None:
391  self.urlProcess.url = elemUrl
392  detectedMime = self.urlProcess.detectUrlMime(self.siteProperties["CONTENT_TYPE_MAP"] if \
393  "CONTENT_TYPE_MAP" in self.siteProperties else None, \
394  self.batchItem.urlObj)
395  urlMd5 = hashlib.md5(elemUrl).hexdigest()
396  self.urlProcess.url = elemUrl
397  self.urlProcess.siteId = localSiteId
398  self.urlProcess.urlObj = self.url
399  try:
400  localUrlObj = self.urlProcess.\
401  createUrlObjForChain(localChainDict["url_pattern"], urlMd5, formMethods,
402  self.batchItem.urlId, depth, detectedMime, self.site.maxURLsFromPage)
403  if localUrlObj is not None:
404  chainUrls.append(copy.deepcopy(localUrlObj))
405  except Exception as excp:
406  logger.error("Error in URL_CHAIN deserialize, excp = " + str(excp))
407  if len(urlSet) > 0 and len(params) == 0:
408  logger.debug("Zero urls are collected for len(urlSet): %s", str(len(urlSet)))
409  elif len(params) > 0:
410  logger.debug("Collected and send to insert as new: %s", str(len(urlSet)))
411  if not readOnly:
412 
413  if self.dbWrapper is not None:
414  self.dbWrapper.urlNew(params)
415  self.dbWrapper.urlNew(chainUrls)
416  self.urlProcess.updateTypeForURLObjects(chainUrls)
417  self.dbWrapper.collectedURLsRecalculating(localSiteId)
418 
419  if formFields is not None and self.postForms is not None and self.dbWrapper is not None:
420  fieldParams = self.getFieldParams(formFields, self.postForms, localSiteId)
421  self.insertNewSiteProperties(fieldParams, self.dbWrapper, localSiteId)
422 
423  # logger.debug("Return from collectURLs:\n%s\n%s\n%s\n%s\n%s\n%s", str(nextStep), str(internalLinks),
424  # str(externalLinks), str(params), str(self.feedItems), str(chainUrls))
425 
426  return nextStep, internalLinks, externalLinks, params, self.feedItems, chainUrls
427 
428 
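process() returns six values in a fixed order and expects every mandatory field checked by checkFieldsIsNone() to be assigned beforehand. A minimal calling sketch, assuming a fully initialized instance; the header and code values are illustrative:

nextStep, internalLinks, externalLinks, params, feedItems, chainUrls = \
    collector.process(httpCode=200,
                      readOnly=False,
                      httpApplyHeaders={"User-Agent": "hce-crawler"},
                      proxyName=None)
# params and chainUrls hold new URL objects for the urls_<siteId> table;
# feedItems is filled only by the RSS / feed-parser processors.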
429  # # feedElementsProcessing processes an RSS feed element
430  #
431  # @param urlMd5 - element url's urlMd5
432  # @param httpCode - http code of http response
433  # @param elemUrl - element url's url
434  # @param localSiteId - siteId
435  # @param localUrlObj - element's urlObj
436  # @param localUrl - root element's url
437  # @param params - element container (List type)
438  # @param maxURLsFromPage - max URLs from page
439  # @param rootFeed - boolean flag specifying whether to use the feed element of the RSS structure instead of an entries element
440  def feedElementsProcessing(self, urlMd5, httpCode, elemUrl, localSiteId, localUrlObj, localUrl, params,
441  maxURLsFromPage, rootFeed=False):
442 
443  if maxURLsFromPage > 0 and len(self.feedItems) >= maxURLsFromPage:
444  logger.debug("Site maxURLsFromPage = %s limit reached on %s number.",
445  str(maxURLsFromPage), str(len(self.feedItems)))
446  else:
447  if self.feed is not None:
448  self.urlProcess.url = elemUrl
449  self.urlProcess.siteId = localSiteId
450  self.urlProcess.urlObj = localUrlObj
451  localRet = self.urlProcess.fillRssFieldInUrlObj(localUrl, self.url.url, self.batchItem, self.processorName,
452  self.feed, rootFeed)
453  self.urlProcess.urlObj = None
454  if localRet is not None:
455  localRet["urlMd5"] = urlMd5
456  if localRet["urlObj"] is not None:
457  localRet["urlObj"].httpCode = httpCode
458  localRet["urlObj"].processingDelay = 0
459  localRet["urlObj"].parentMd5 = self.url.urlMd5
460 
461  # logger.debug("localRet = %s", str(dict(localRet)))
462 
463  params.append(localRet["urlObj"])
464  self.feedItems.append(localRet)
465  else:
466  logger.debug("self.feed is None!")
467 
468  # # processProcessor processes elements from the URL's DOM
469  #
470  # @param urlSet - set of collected URL candidates (links from the page)
471  # @param dom - page's DOM model
472  # @param urlXpathList - XPath list to apply
473  # @param urlObj - URL object of the processed element
474  def processProcessor(self, urlSet, dom, urlXpathList, urlObj):
475  if (self.processorName == PCONSTS.PROCESSOR_FEED_PARSER or self.processorName == PCONSTS.PROCESSOR_RSS) \
476  and urlObj.type != dc.EventObjects.URL.TYPE_FETCHED:
477  if feedparser is not None:
478  try:
479  self.feedItems = []
480  # Add one more date parsing handler function to fix some wrong datetime format cases; added by bgv
481  feedparser.registerDateHandler(self.feedparserParseDateFixes)
482 
483  # Remove handlers so that all tags are processed as unknown and their names are kept unchanged
484  if not (self.siteProperties is not None and "RSS_FEEDPARSER_MODE" in self.siteProperties and \
485  int(self.siteProperties["RSS_FEEDPARSER_MODE"]) > 0):
486  import inspect
487  # , "_start_guid"
488  excludes = ["_start_rss", "_start_channel", "_start_feed", "_start_item", "_start_link",
489  "_start_admin_errorreportsto", "_start_admin_generatoragent", "_start_guid", "_start_id",
490  "_start_entry", "_start_enclosure"]
491  for methodName, functionObject in inspect.getmembers(feedparser._FeedParserMixin, predicate=inspect.ismethod): # pylint: disable=W0612,W0212,C0301
492  if methodName.startswith("_start_") and methodName not in excludes:
493  delattr(feedparser._FeedParserMixin, methodName) # pylint: disable=W0212
494  endMethodName = methodName.replace("_start_", "_end_")
495  if hasattr(feedparser._FeedParserMixin, endMethodName): # pylint: disable=W0212
496  delattr(feedparser._FeedParserMixin, endMethodName) # pylint: disable=W0212
497  setattr(feedparser._FeedParserMixin, "_normalize_attributes", self._normalize_attributes) # pylint: disable=W0212
498  feedparser.FeedParserDict.keymap["guid"] = "guid"
499  logger.debug("Feedparser in modified mode")
500  else:
501  logger.debug("Feedparser in native mode")
502 
503  self.feed = feedparser.parse(self.crawledResource.html_content)
504  urlSet.update(entry.link for entry in self.feed.entries)
505  # logger.debug("feed.entries: %s for url: %s\nfeed=\n%s\nurlSet:\n%s", str(len(self.feed.entries)),
506  # str(urlObj.url), str(dict(self.feed)), str(urlSet))
507  if len(self.feed.entries) == 0:
508  logger.debug("Zero entries in feed, self.crawledResource:\n%s", varDump(self.crawledResource))
509  # logger.debug("self.processContentTypes: %s", str(self.processContentTypes))
510  logger.debug("self.crawledResource.content_type = %s", str(self.crawledResource.content_type))
511  if self.crawledResource.content_type == dc.EventObjects.URL.CONTENT_TYPE_TEXT_HTML:
512  urlObj.errorMask |= APP_CONSTS.ERROR_MASK_SITE_UNSUPPORTED_CONTENT_TYPE
513 
514  except TypeError as err:
515  logger.debug("WRONG CONTENT FOR URL <" + str(urlObj.url) + "> not rss feed. " + str(err.message))
516  except Exception as err:
517  logger.debug("SOME ERROR WITH rss feed parse " + str(err.message))
518  else:
519  logger.debug("feedparser module not found")
520  # won't collect urls from rss feed resources
521  elif self.processorName != PCONSTS.PROCESSOR_RSS:
522  # Support urlXpathList provided via site properties
523  if len(urlXpathList) > 0:
524  logger.debug("Site has COLLECT_URLS_XPATH_LIST property: %s", str(urlXpathList))
525  else:
526  # Set urls xpath list
527  urlXpathList = {'sets': {'': self.urlsXpathList}}
528  logger.debug("Site has no COLLECT_URLS_XPATH_LIST property, default xpath list used: %s", str(urlXpathList))
529  if 'sets' in urlXpathList and isinstance(urlXpathList['sets'], dict):
530  matchedSets = 0
531  if 'date_format' in urlXpathList:
532  dformat = str(urlXpathList['date_format'])
533  else:
534  dformat = '%Y-%m-%d %H:%M:%S'
535  for rexpr in urlXpathList['sets']:
536  if rexpr == '' or re.search(rexpr, urlObj.url) is not None:
537  if 'mode' in urlXpathList and int(urlXpathList['mode']) == 1:
538  xpathl = self.urlsXpathList + urlXpathList['sets'][rexpr]
539  else:
540  xpathl = urlXpathList['sets'][rexpr]
541  matchedSets += 1
542  for xpath in xpathl:
543  xpath = self.evaluateDateMacro(xpath, dformat)
544  elem = dom.xpath(xpath)
545  elem_type = type(elem)
546  if elem_type == list and len(elem) > 0 and hasattr(elem[0], "tail"):
547  urlSet.update([el.tail for el in elem])
548  elif elem_type == list and len(elem) > 0 and isinstance(elem[0], lxml.html.HtmlElement):
549  urlSet.update([el.text for el in elem])
550  else:
551  urlSet.update(elem)
552  if matchedSets == 0:
553  logger.debug("Warning! No one xpath set matched URL %s, URLs not collected!", urlObj.url)
554  else:
555  logger.debug('Wrong COLLECT_URLS_XPATH_LIST property, `sets` key with dict() of re->xpath_list[] expected!' + \
556  ' Collect URLs aborted!')
557  # logger.debug("urlSet: %s", str(urlSet))
558 
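The structure expected in COLLECT_URLS_XPATH_LIST follows from the checks above: a 'sets' dict mapping a URL regular expression (matched with re.search, the empty string matches everything) to a list of XPath expressions, plus optional 'mode' and 'date_format' keys. A hedged example of such a value; the patterns themselves are illustrative:

urlXpathList = {
  "mode": 1,                      # 1: append the set to the default urlsXpathList
  "date_format": "%Y-%m-%d",      # format used by the %@DATE(...)% macro
  "sets": {
    "": ["//a/@href"],                                        # applies to every URL
    "news\\.example\\.com": ["//div[@id='archive-%@YEAR(+0)%']//a/@href"]
  }
}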
559 
560  # #Evaluate date macro
561  #
562  # @param localPattern - input pattern as string
563  # @param dateFromat - date format for the %@DATE(...)% macro
564  # @return string with date macros evaluated and replaced by their values
565  def evaluateDateMacro(self, localPattern, dateFromat):
566  import time
567  import datetime
568  try:
569  d = {'DATE':'', 'SHORTYEAR':'y', 'YEAR':'Y', 'MONTH':'m', 'DAY':'d', 'HOUR':'H', 'MINUTE':'M', 'SECOND':'S'}
570  regex = re.compile("%@(SHORTYEAR|YEAR|MONTH|DAY|HOUR|MINUTE|SECOND|DATE)\\(([\\+|\\-]\\d{1,2})\\)%")
571  matchArray = regex.findall(localPattern)
572  for i in matchArray:
573  if i[0] == 'DATE':
574  f = dateFromat
575  else:
576  f = '%' + d[i[0]]
577  t = time.strftime(f, time.gmtime(time.time() + datetime.timedelta(hours=int(i[1])).total_seconds()))  # total_seconds() keeps negative offsets negative
578  localPattern = localPattern.replace("%@" + i[0] + "(" + i[1] + ")%", t)
579  except Exception as err:
580  logger.error(str(err))
581 
582  return localPattern
583 
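The macro syntax handled above is %@NAME(+N)% or %@NAME(-N)%, where NAME is one of DATE, SHORTYEAR, YEAR, MONTH, DAY, HOUR, MINUTE, SECOND and N is an hour offset of one or two digits. A small usage sketch against a CollectURLs instance; the expanded value shown in the comment depends on the current UTC time:

xpath = "//a[contains(@href, '/archive/%@YEAR(+0)%/%@MONTH(+0)%/')]"
expanded = collector.evaluateDateMacro(xpath, '%Y-%m-%d %H:%M:%S')
# e.g. "//a[contains(@href, '/archive/2015/08/')]"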
584 
585  # # extractFormURL extracts URLs, form actions, and fields from the HTML DOM
586  #
587  # @param dom the dom tree
588  # @return form_urls sequence of urls extracted
589  # @return form_methods dict of form methods, {form_action: form_method}
590  # @return form_fields dict of fields {field_name: field_value}
591  def extractFormURL(self, dom, siteProperties):
592  formUrls, formMethods, formFields = [], {}, {}
593  if self.COLLECT_POST_DATA_NAME in siteProperties and \
594  siteProperties['COLLECT_POST_DATA'] == self.COLLECT_POST_DATA:
595  for form in dom.xpath("//form"):
596  formAction = None
597  formMethod = 'get'
598  for attr in form.keys():
599  if attr.lower() == "action":
600  formAction = form.get(attr)
601  formUrls.append(formAction)
602  elif attr.lower() == "method":
603  formMethod = form.get(attr)
604  if not formAction:
605  continue
606  formMethods[formAction] = formMethod
607  for field in form.getchildren():
608  tagName, tagValue = None, ""
609  for fieldTag in field.keys():
610  if fieldTag.lower() == "name":
611  tagName = field.get(fieldTag)
612  elif fieldTag.lower() == "value":
613  tagValue = field.get(fieldTag)
614  if tagName:
615  formFields[tagName] = tagValue
616  logger.info("extracted form data, formUrls:%s, formMethods:%s, formFields:%s", \
617  formUrls, formMethods, formFields)
618  return formUrls, formMethods, formFields
619 
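extractFormURL only collects anything when the COLLECT_POST_DATA site property equals "1"; it records each form's action and method and gathers name/value pairs from the form's direct children. A self-contained sketch with an illustrative HTML fragment:

import lxml.html

dom = lxml.html.fromstring(
  '<form action="/login" method="post">'
  '<input name="user" value="guest"/><input name="lang" value="en"/></form>')
formUrls, formMethods, formFields = collector.extractFormURL(
  dom, {"COLLECT_POST_DATA": "1"})
# Expected roughly:
#   formUrls    == ['/login']
#   formMethods == {'/login': 'post'}
#   formFields  == {'user': 'guest', 'lang': 'en'}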
620 
621  # # Applies filters and returns a boolean result
622  #
623  # @param inputFilters - input sites filters list
624  # @param subject - subject for apply filter
625  # @param depth - depth value
626  # @param wrapper - DBTasksWrapper instance
627  # @param siteId - site ID used with db-task wrapper
628  # @param fields - dictionary values of support macro names ('PDATE' and other)
629  # @param opCode - operation code
630  # @param stage - stage of apply filter
631  # @param selectSubject - select subject, used to select from the DB
632  # @param defaultValue - default value for result
633  # @return True if the filters pass, False otherwise
634  @staticmethod
635  def filtersApply(inputFilters, subject, depth, wrapper, siteId, fields=None, opCode=Filters.OC_RE, \
636  stage=Filters.STAGE_COLLECT_URLS, selectSubject=None, defaultValue=False):
637  ret = defaultValue
638  fValue = Utils.generateReplacementDict()
639  fValue.update({"MAX_DEPTH": str(depth)})
640 
641  if inputFilters is not None:
642  for inputFilter in inputFilters:
643  if inputFilter.stage == Filters.STAGE_ALL or inputFilter.stage == Filters.STAGE_REDIRECT_URL:
644  inputFilter.stage = Filters.STAGE_COLLECT_URLS
645 
646 # logger.debug(">>> Filters() (2.1) fields: " + varDump(fields) + " inputFilters: " + varDump(inputFilters))
647  localFilters = Filters(filters=inputFilters, dbTaskWrapper=wrapper, siteId=siteId, readMode=0, fields=fields,
648  opCode=opCode, stage=stage, selectSubject=selectSubject)
649 
650  # logger.debug(">>> before filter include = " + subject[:255] + ' . . . ')
651  fResult = localFilters.filterAll(stage, fValue, Filters.LOGIC_OR, subject, 1)
652  logger.debug(">>> filter result include - " + str(fResult))
653  for elem in fResult:
654 # logger.debug('elem = ' + str(elem) + ' type: ' + str(type(elem)))
655  if elem > 0:
656  ret = True
657  break
658 
659  if ret is True:
660  # logger.debug(">>> before filter exclude = " + subject[:255] + ' . . . ')
661  fResult = localFilters.filterAll(stage, fValue, Filters.LOGIC_OR, subject, -1)
662  logger.debug(">>> filter result exclude - " + str(fResult))
663  for elem in fResult:
664 # logger.debug('elem = ' + str(elem) + ' type: ' + str(type(elem)))
665  if elem > 0:
666  ret = False
667  break
668 
669  logger.debug("Verdict: " + str(ret))
670  return ret
671 
672 
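The verdict above follows an include-then-exclude ordering: start from defaultValue, switch to True if any include filter matches, then drop back to False if any exclude filter matches. A schematic of the same ordering with plain regular expressions instead of the DB-backed Filters class (not the real implementation):

import re

def verdictSketch(subject, includePatterns, excludePatterns, defaultValue=False):
  ret = defaultValue
  if any(re.search(p, subject) for p in includePatterns):
    ret = True
  if ret and any(re.search(p, subject) for p in excludePatterns):
    ret = False
  return ret

# verdictSketch("http://example.com/news/1", [r"/news/"], [r"\.pdf$"])  -> True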
673  # #getFieldParams method fills and returns post form field parameters
674  #
675  # formFields - post fields dict
676  # postForms - post form list
677  # siteId - site's id
678  def getFieldParams(self, formFields, postForms, siteId):
679  ret = []
680  for fieldName, fieldValue in formFields.iteritems():
681  if fieldName in postForms:
682  continue
683  logger.debug("field_name: %s", fieldName)
684  logger.debug("field_value: %s", fieldValue)
685  ret.append((siteId, "HTTP_POST_FORM_" + fieldName, fieldValue))
686  return ret
687 
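getFieldParams turns collected form fields into (siteId, property name, value) tuples, skipping any field already present in postForms (keyed by the bare field name, as built from HTTP_POST_FORM_* properties). An illustrative call:

formFields = {"user": "guest", "token": "abc"}
postForms = {"user": "admin"}   # already configured as HTTP_POST_FORM_user
fieldParams = collector.getFieldParams(formFields, postForms, "42")
# -> [("42", "HTTP_POST_FORM_token", "abc")]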
688 
689  # #insertNewSiteProperties updates the Site (adds new properties to the corresponding table)
690  #
691  # params - params list
692  # wrapper - db-task wrapper
693  # siteId - site's id
694  def insertNewSiteProperties(self, params, wrapper, siteId):
695  if siteId is not None and hasattr(params, '__iter__') and len(params) > 0:
696  localSiteUpdate = dc.EventObjects.SiteUpdate(siteId)
697  for attr in localSiteUpdate.__dict__:
698  if hasattr(localSiteUpdate, attr):
699  setattr(localSiteUpdate, attr, None)
700  localSiteUpdate.updateType = dc.EventObjects.SiteUpdate.UPDATE_TYPE_APPEND
701  localSiteUpdate.id = siteId
702  localSiteUpdate.properties = []
703  for param in params:
704  newPropElem = {}
705  newPropElem["siteId"] = param[0]
706  newPropElem["name"] = param[1]
707  newPropElem["value"] = param[2]
708  localSiteUpdate.properties.append(newPropElem)
709  wrapper.siteNewOrUpdate(localSiteUpdate, stype=dc.EventObjects.SiteUpdate)
710 
711 
712  # #feedparserParseDateFixes method to fix date parsing for the feedparser
713  #
714  # @param aDateString the date string to parse
715  def feedparserParseDateFixes(self, aDateString):
716  ret = None
717  ds = aDateString
718 
719  # Assumes the date format is broken and contains a colon ":" in the TZ offset, like: "Wed, 19 Aug 2015 08:45:53 +01:00"
720  parts = ds.split(' ')
721  if ("+" in parts[len(parts) - 1] or "-" in parts[len(parts) - 1]) and ":" in parts[len(parts) - 1]:
722  parts[len(parts) - 1] = parts[len(parts) - 1].replace(":", "")
723  ds = " ".join(parts)
724  # ret = feedparser._parse_date_rfc822(ds)
725  ret = feedparser._parse_date(ds) # pylint: disable=W0212
726 
727  return ret
728 
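The handler targets RFC 822-style dates whose timezone offset contains a colon, which stock feedparser rejects. A standalone sketch of just the string normalization step; the actual parsing is delegated to feedparser._parse_date:

ds = "Wed, 19 Aug 2015 08:45:53 +01:00"
parts = ds.split(' ')
if ("+" in parts[-1] or "-" in parts[-1]) and ":" in parts[-1]:
  parts[-1] = parts[-1].replace(":", "")
print(" ".join(parts))   # Wed, 19 Aug 2015 08:45:53 +0100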
729 
730  def _normalize_attributes(self, kv):
731  return (kv[0], kv[1])