HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
EventObjects.py
Go to the documentation of this file.
1 '''
2 HCE project, Python bindings, Distributed Crawler application.
3 Event objects definitions.
4 
5 @package: dc
6 @author bgv bgv.hce@gmail.com
7 @link: http://hierarchical-cluster-engine.com/
8 @copyright: Copyright © 2013-2014 IOIX Ukraine
9 @license: http://hierarchical-cluster-engine.com/license/
10 @since: 0.1
11 '''
12 import hashlib
13 import time
14 
15 from app.Utils import JsonSerializable
16 from app.Utils import SQLExpression
17 from app.Utils import UrlNormalizator
18 import app.Consts as APP_CONSTS
19 import app.Utils as Utils # pylint: disable=F0401
20 
21 # Logger initialization
22 logger = Utils.MPLogger().getLogger()
23 
24 DELAYED_OPERATION = 0
25 NOT_DELAYED_OPERATION = 1
26 
27 # #Site event object, defines the Site abstraction
28 #
29 # The site object used to create site representation inside DC application.
30 # This is a main data unit that is used by DC inside to operate.
32 
33  STATE_ACTIVE = 1
34  STATE_DISABLED = 2
35  STATE_SUSPENDED = 3
36  STATE_DELETED = 4
37  STATE_DELETE_TASK = 5
38  STATE_RESTART = 6
39  STATE_CLEANED = 7
40  STATE_CLEANUP_TASK = 8
41  STATE_NOT_FOUND = 9
42 
43  FETCH_TYPE_STATIC = 1
44  FETCH_TYPE_DYNAMIC = 2
45  FETCH_TYPE_AUTO = 7
46  FETCH_TYPE_EXTERNAL = 3
47 
48  # Default priority for Site
49  DEFAULT_PRIORITY = 100
50 
51  # Default categoryId
52  DEFAULT_CATEGORY_ID = 0
53 
54  # #constructor
55  # initialize fields
56  # @param url One of possible url used as root entry point of the site
57  #
58  def __init__(self, url, _userId=0):
59  super(Site, self).__init__()
60 
61  url = URL(siteId=0, url=url,
62  normalizeMask=UrlNormalizator.NORM_NONE).getURL(normalizeMask=UrlNormalizator.NORM_NONE)
63  # #@var id
64  # The site Id, calculated as md5 hash from first root url
65  self.id = hashlib.md5(url).hexdigest()
66  self.uDate = None
67  self.tcDate = None
68  self.cDate = SQLExpression("NOW()")
69  self.resources = 0
70  self.contents = 0
71  self.iterations = 0
72  # #@var state
73  # The site state
74  self.state = self.STATE_ACTIVE
75  # #@var priority
76  # The site priority, low value means low priority
78  # The max URLs number that can be collected for site, 0 - means unlimited
79  self.maxURLs = 0
80  # The max resources number that can be collected for site, 0 - means unlimited
81  self.maxResources = 0
82  # The max crawling error number, 0 - means unlimited
83  self.maxErrors = 0
84  # The max resource size, byte
85  self.maxResourceSize = 0
86  # HTTP request delay, msec
87  self.requestDelay = 500
88  # Content processing delay, msec
89  self.processingDelay = 500
90  # HTTP response timeout, msec
91  self.httpTimeout = 30000
92  # Error mask bit set, see detailed specification
93  self.errorMask = 0
94  # Errors counter
95  self.errors = 0
96  # the sum of all raw content files sizes of resources crawled
97  self.size = 0
98  # AVG bytes per second (BPS) rate
99  self.avgSpeed = 0
100  # total times of calculating avg speed
102  # URL type
103  # 0 - Regular, collect URLs and insert only for this site according filters;
104  # 1 - Single, do not collect URLs,
105  # 3 - collect URLs, create sites and insert for all
106  self.urlType = 0
107  # #@var description
108  # The site description.
109  self.description = ""
110  # #@var urls
111  # The list of urls strings used as root entry points
112  self.urls = []
113  if url is not None and len(url) > 0:
114  localUrl = SiteURL(siteId=self.id, url=url, normalizeMask=UrlNormalizator.NORM_NONE)
115  self.urls.append(localUrl)
116  # #@var filters
117  # The list of url filters object SiteFilter
118  self.filters = [SiteFilter(self.id, "(.*)")]
119  # #@var properties
120  # The dic of site properties fields
121  self.properties = [{"name": "PROCESS_CTYPES", "value": "text/html"},
122  {"name": "STORE_HTTP_REQUEST", "value": "1"},
123  {"name": "STORE_HTTP_HEADERS", "value": "1"},
124  {"name": "HTTP_HEADERS", "value": ""},
125  {"name": "HTTP_COOKIE", "value": ""}]
126  self.userId = _userId
127  # #@var recrawlPeriod
128  self.recrawlPeriod = 0
129  # #@var recrawlDate
130  self.recrawlDate = None
131  # #@var maxURLsFromPage
133  # #@var collectedURLs
134  self.collectedURLs = 0
135  # #@var fetchType
137  # #@var newURLs
138  self.newURLs = 0
139  # #@var deletedURLs
140  self.deletedURLs = 0
141  # #@var moveURLs
142  self.moveURLs = True
143  self.tcDateProcess = None
145 
146 
147  # #Rewrite internal site's fields by another siteObj
148  #
149  # @param siteObj another Site object
150  # @param addListFields bool value that means - extend or overwrite list type fields
def rewriteFields(self, siteObj, addListFields=True):
    """Copy the fields of another site object into this one.

    Every non-None plain field of siteObj overwrites the field of the same
    name here. The container fields "urls", "filters" and "properties" are
    handled separately: they are extended in place when addListFields is
    true, and replaced by siteObj's own object otherwise. A None value in
    siteObj never changes this object.

    @param siteObj another Site object supplying the new values
    @param addListFields True to extend the container fields, False to overwrite them
    """
    containerFields = ["urls", "filters", "properties"]
    mine = self.__dict__
    theirs = siteObj.__dict__

    # Plain fields: take whatever is actually set on the other object.
    for name, value in theirs.items():
        if value is not None and name not in containerFields:
            mine[name] = value

    # Container fields: append to or replace the current value.
    for name in containerFields:
        incoming = theirs[name]
        if addListFields:
            current = mine[name]
            if incoming is None:
                continue
            if current is None:
                # Start from a fresh list so siteObj's container is not shared.
                mine[name] = []
            mine[name] += incoming
        elif incoming is not None:
            mine[name] = incoming
165 
166 
167  # #Check item by name in properties container
168  #
169  # @param keyName name
170  # @param properties container
171  @staticmethod
def isInProperties(prop, keyName):
    """Tell whether a properties container holds an entry named keyName.

    Two container layouts are understood: a dict, where keyName must be one
    of its keys, and a list of dicts, where keyName must equal the "name"
    item of one of them. List entries that are not dicts, or that carry no
    "name" key, are skipped instead of raising KeyError.

    @param prop properties container, a dict or a list of dicts
    @param keyName name to look for
    @return True if the name is present, False otherwise (also for any other container type)
    """
    if isinstance(prop, dict):
        return keyName in prop

    if isinstance(prop, list):
        return any(isinstance(item, dict) and "name" in item and keyName == item["name"]
                   for item in prop)

    return False
185 
186 
187  # #Get item field by name from properties container
188  #
189  # @param prop container, a dict or a list of dicts with the "name" key
190  # @param keyName name key of property object item to find
191  # @param fieldName name of property object item field to return, "value" if omitted
192  @staticmethod
def getFromProperties(prop, keyName, fieldName="value"):
    """Fetch a value from a properties container.

    For a dict container the value stored under keyName is returned and
    fieldName is not used. For a list of dicts, the first entry whose "name"
    equals keyName and which also contains fieldName supplies the result.
    List entries that are not dicts, or that carry no "name" key, are skipped
    instead of raising KeyError.

    @param prop properties container, a dict or a list of dicts
    @param keyName name of the property to find
    @param fieldName field of the matching list entry to return, "value" by default
    @return the found value, or None when nothing matches
    """
    if isinstance(prop, dict):
        if keyName in prop:
            return prop[keyName]
        return None

    if isinstance(prop, list):
        for item in prop:
            if not isinstance(item, dict) or "name" not in item:
                continue
            if keyName == item["name"] and fieldName in item:
                return item[fieldName]

    return None
206 
207 
208 
209 # #SiteUpdate event object
210 #
211 # The update site operation object
212 #
214 
215  UPDATE_TYPE_APPEND = 0
216  UPDATE_TYPE_OVERWRITE = 1
217  UPDATE_TYPE_UPDATE = 2
218 
219  # #constructor
220  # initialize fields
221  # @param siteId site identifier
222  #
def __init__(self, siteId, updateType=UPDATE_TYPE_APPEND):
    """Create an update request for one site.

    The parent constructor is run with an empty URL, then the site id is
    replaced by siteId and the fields listed below are reset to None.
    NOTE(review): None presumably marks a field as "not to be updated" -
    confirm against the code that consumes this object.

    @param siteId site identifier
    @param updateType one of the UPDATE_TYPE_* constants
    """
    super(SiteUpdate, self).__init__("")
    self.updateType = updateType
    self.id = siteId

    # Fields cleared after the parent constructor filled in its defaults.
    clearedFields = (
        "uDate", "tcDate", "cDate", "resources", "iterations", "description",
        "urls", "filters", "properties", "state", "priority", "maxURLs",
        "maxResources", "maxErrors", "maxResourceSize", "requestDelay",
        "httpTimeout", "errorMask", "errors", "size", "avgSpeed",
        "avgSpeedCounter", "urlType", "contents", "processingDelay", "userId",
        "recrawlPeriod", "recrawlDate", "maxURLsFromPage", "criterions",
        "collectedURLs", "newURLs", "deletedURLs", "fetchType",
        "tcDateProcess", "categoryId",
    )
    for fieldName in clearedFields:
        setattr(self, fieldName, None)
263 
264 
265 
266 # #SiteFind event object
267 #
268 # Get list of Site objects and find them by URL pattern
269 #
271 
272  MAX_NUMBER_DEFAULT = 10
273 
274  CRITERION_LIMIT = "LIMIT"
275  CRITERION_WHERE = "WHERE"
276  CRITERION_ORDER = "ORDER BY"
277  CRITERION_TABLES = "TABLES"
278  DEFAULT_ORDER_BY_CDATE = "CDate DESC"
279  DEFAULT_TABLES = ""
280 
281  # #constructor
282  # initialize fields
283  # @param url pattern of site's root URL that will be find in root url from left string position
284  # @param group bool property defines does results will be grouped from several hosts for one site in one or will be
285  # listed as is
286  # @param maxNumber maximum items for SQL query returned unique sites Ids
287  # @param offset for SQL query returned unique sites Ids
288  #
289  def __init__(self, url, criterions=None):
290  super(SiteFind, self).__init__()
291  self.url = url
292 
293  # Init criterions for Sites
294  if criterions is None:
295  criterions = {}
296  self.criterions = criterions
297 
298  if self.CRITERION_ORDER not in criterions:
300 
301  if self.CRITERION_WHERE not in criterions:
302  self.criterions[self.CRITERION_WHERE] = "1=1"
303 
304  if self.CRITERION_LIMIT not in criterions:
305  self.criterions[self.CRITERION_LIMIT] = str(self.MAX_NUMBER_DEFAULT)
306 
307  if self.CRITERION_TABLES not in criterions:
308  self.criterions[self.CRITERION_TABLES] = str(self.DEFAULT_TABLES)
309 
310  self.excludeList = []
311 
312 
313 
314 # #SiteStatus event object
315 #
316 # The get site status operation object.
317 #
319 
320  # #constructor
321  # initialize fields
322  # @param siteId site identifier, used to get site data from correspondent tables
323  # @param deleteTaskId delete task identifier to check task state, if state is finished get task data request Type=1
324  # to delete task's data from EE before return response. If delete task finished, Site.state=STATE_DELETED returned,
325  # if not - Site.state=STATE_DELETE_TASK returned
326  #
def __init__(self, siteId, deleteTaskId=None):
    """Create a site status request.

    @param siteId site identifier
    @param deleteTaskId identifier of a delete task, None by default
    """
    super(SiteStatus, self).__init__()
    self.id, self.deleteTaskId = siteId, deleteTaskId
    # Starts out empty; filled in by the caller if needed.
    self.excludeList = []
332 
333 
334 
335 # #SiteDelete event object
336 #
337 # The delete site operation object.
338 #
340 
341  TASK_TYPE_SYNC = 1
342  TASK_TYPE_ASYNC = 2
343 
344  # #constructor
345  # initialize fields
346  # @param siteId site identifier
347  # @param taskType delete task type - sync or async in specification of DRCE API protocol
348  #
def __init__(self, siteId=None, taskType=TASK_TYPE_SYNC, criterions=None):
    """Create a site delete request.

    When a site id is given and the criterions carry no WHERE clause of
    their own, a default clause selecting that site is added. The criterions
    dict passed by the caller is stored as is (not copied), so the default
    clause is written into the caller's dict.

    @param siteId site identifier, None when the sites are selected by criterions only
    @param taskType delete task type, TASK_TYPE_SYNC or TASK_TYPE_ASYNC
    @param criterions dict of SQL query parts keyed by the URLFetch.CRITERION_* names
    """
    super(SiteDelete, self).__init__()
    self.id = siteId
    self.taskType = taskType
    self.delayedType = NOT_DELAYED_OPERATION
    self.criterions = criterions if criterions is not None else {}

    if self.id is not None and URLFetch.CRITERION_WHERE not in self.criterions:
        # The backticks must enclose the column name only and the value must
        # be a quoted string literal; the previous form "`Site_Id=`<id>" was
        # not a valid condition.
        # NOTE(review): the id is not escaped - assumes ids are MD5 hex strings.
        self.criterions[URLFetch.CRITERION_WHERE] = "`Site_Id`='" + str(self.id) + "'"
360 
361 
362 
363 # #SiteCleanup event object
364 #
365 # The cleanup site operation object.
366 #
368 
369  TASK_TYPE_SYNC = 1
370  TASK_TYPE_ASYNC = 2
371 
372  HISTORY_CLEANUP_NOT = 0
373  HISTORY_CLEANUP_LOG = 1
374  HISTORY_CLEANUP_FULL = 2
375 
376  # #constructor
377  # initialize fields
378  # @param siteId site identifier
379  # @param taskType delete task type - sync or async in specification of DRCE API protocol
380  #
381  def __init__(self, siteId, taskType=TASK_TYPE_SYNC):
382  super(SiteCleanup, self).__init__()
383  self.id = siteId
384  self.taskType = taskType
385  self.delayedType = NOT_DELAYED_OPERATION
386  self.moveURLs = True
387  self.saveRootUrls = True
388  self.state = Site.STATE_ACTIVE
390 
391 
392 
393 # #SiteFilter object
394 #
395 # The SiteFilter object.
396 #
398 
399  TYPE_EXCLUDE = 0
400  TYPE_INCLUDE = 1
401 
402  TYPE_DISABLED = 0
403  TYPE_ENABLED = 1
404 
405  TYPE_URL = 0
406  TYPE_MEDIA = 1
407  # #constructor
408  # initialize fields
409  # @param siteId site identifier
410  # @param pattern string
411  # @param ptype type of pattern to enable or to disable if satisfy
412  #
def __init__(self, siteId, pattern, ptype=TYPE_INCLUDE, pmode=TYPE_URL, pstate=TYPE_ENABLED):
    """Create a filter bound to a site.

    @param siteId site identifier
    @param pattern pattern string
    @param ptype value stored in the ``type`` field, TYPE_INCLUDE by default
    @param pmode value stored in the ``mode`` field, TYPE_URL by default
    @param pstate value stored in the ``state`` field, TYPE_ENABLED by default
    """
    super(SiteFilter, self).__init__()
    # Values supplied by the caller.
    self.siteId, self.pattern = siteId, pattern
    # Fixed initial values.
    self.subject = ""
    self.opCode = 0
    self.stage = 5
    self.action = 1
    # Values supplied by the caller (or their defaults).
    self.type, self.mode, self.state = ptype, pmode, pstate
    # Dates are left unset.
    self.uDate = self.cDate = None
    self.groupId = 0
427 
428 
429 
430 # #URL event object
431 #
432 # The URL event object for operations uses URLs.
433 #
435 
436  # URL states, used by selection condition to crawl and process
437  STATE_ENABLED = 0
438  STATE_DISABLED = 1
439  STATE_ERROR = 2
440 
441  # URL statuses, used by selection condition to crawl and process and to indicate state of operations
442  STATUS_UNDEFINED = 0
443  STATUS_NEW = 1
444  STATUS_SELECTED_CRAWLING = 2
445  STATUS_CRAWLING = 3
446  STATUS_CRAWLED = 4
447  STATUS_SELECTED_PROCESSING = 5
448  STATUS_PROCESSING = 6
449  STATUS_PROCESSED = 7
450  STATUS_SELECTED_CRAWLING_INCREMENTAL = 8
451 
452  # content statuses mask
453  CONTENT_EMPTY = 0
454  CONTENT_STORED_ON_DISK = 1 << 0
455 
456  # Type of collect operation "Regular" - collects URLs and insert only for this site according filters
457  TYPE_REGULAR = 0
458  # Type of collect operation "Single", do not collect URLs
459  TYPE_SINGLE = 1
460  # Type of collect operation "Regular ext." - collects URLs and insert for all sites according filters
461  TYPE_REGULAR_EXT = 2
462  # Type of collect operation "New site" - collect URLs, create sites and insert URLs for all sites according filters
463  TYPE_NEW_SITE = 3
464  # Type of is url fetched already
465  TYPE_FETCHED = 4
466  # Type of is url from real time crawling task
467  TYPE_REAL_TIME_CRAWLER = 5
468  # Type of is url from real time crawling task
469  TYPE_CHAIN = 6
470 
471  # Explicit type means that if site not resolved by Id or Id is empty or None, put URL to general DB table
472  SITE_SELECT_TYPE_EXPLICIT = 0
473  # Auto type means that if site not resolved by ID or Id is empty or None, generate Id by qualified URL domain and
474  # try to identify site and if site not resolved - create new site table using qualified domain name
475  SITE_SELECT_TYPE_AUTO = 1
476  # Qualify URL type means that if site not resolved by Id or empty or None, try to qualify domain name and generate Id
477  # If site not resolved, put URL to general DB table
478  SITE_SELECT_TYPE_QUALIFY_URL = 2
479  SITE_SELECT_TYPE_NONE = 3
480 
481  CONTENT_TYPE_TEXT_HTML = "text/html"
482  CONTENT_TYPE_UNDEFINED = ""
483 
484  URL_NORMALIZE_MASK = UrlNormalizator.NORM_DEFAULT
485 
486  # #constructor
487  # initialize fields
488  # @param siteId site identifier
489  #
490  def __init__(self, siteId, url, state=STATE_ENABLED, urlUpdate=None, normalizeMask=URL_NORMALIZE_MASK):
491  super(URL, self).__init__()
492 
493  self.siteId = siteId
494  self.url = url
495  if url is not None:
496  # normalize url according to RFC 3986
497  self.url = self.getURL(normalizeMask)
498  self.type = self.TYPE_REGULAR
499  self.state = state
500  self.status = self.STATUS_NEW
502  self.crawled = 0
503  self.processed = 0
504  if url is not None:
505  self.urlMd5 = hashlib.md5(self.url).hexdigest()
506  else:
507  self.urlMd5 = None
509  self.requestDelay = 500
510  self.processingDelay = 500
511  self.httpTimeout = 30000
512  self.charset = ""
513  self.batchId = 0
514  self.errorMask = 0
515  self.crawlingTime = 0
516  self.processingTime = 0
517  self.totalTime = 0
518  self.httpCode = 0
519  self.UDate = None
520  self.CDate = None
521  self.httpMethod = "get"
522  self.size = 0
523  self.linksI = 0
524  self.linksE = 0
525  self.freq = 0
526  self.depth = 0
527  self.rawContentMd5 = ""
528  self.parentMd5 = ""
529  self.lastModified = None
530  self.eTag = ""
531  self.mRate = 0.0
532  self.mRateCounter = 0
533  self.tcDate = None
534  self.maxURLsFromPage = 100
536  self.tagsMask = 0
537  self.tagsCount = 0
538  self.pDate = None
539  self.contentURLMd5 = ""
540  self.priority = 0
541  self.urlUpdate = urlUpdate
542  self.urlPut = None
543  self.chainId = None
544  self.classifierMask = 0
545  self.attributes = []
546 
547 
548  # #Get the URL, optionally normalized
549  #
550  # @param normalizeMask normalization mask, UrlNormalizator.NORM_NONE returns the URL as is
551  #
def getURL(self, normalizeMask=URL_NORMALIZE_MASK):
    """Return the URL of this object, optionally normalized.

    @param normalizeMask mask passed to UrlNormalizator.normalize();
                         UrlNormalizator.NORM_NONE returns self.url unchanged
    @return the stored URL or its normalized form
    """
    if normalizeMask != UrlNormalizator.NORM_NONE:
        return UrlNormalizator.normalize(self.url, None, normalizeMask)
    return self.url
558 
559 
560 
561 # #SiteURL event object
562 #
563 # The SiteURL event object for operations uses sites_urls table.
564 #
class SiteURL(URL):
    """URL variant that additionally carries a ``userId`` field."""

    def __init__(self, siteId, url, stateField=None, normalizeMask=URL.URL_NORMALIZE_MASK):
        """Create the object.

        @param siteId site identifier
        @param url URL string
        @param stateField forwarded to the parent constructor as its ``state`` argument;
                          None by default, which replaces the parent's own default
        @param normalizeMask normalization mask forwarded to the parent constructor
        """
        super(SiteURL, self).__init__(siteId=siteId, url=url, state=stateField,
                                      normalizeMask=normalizeMask)
        self.userId = None
571 
572 
573 
574 # #URLStatus event object
575 #
576 # The URLStatus event object for URL_STATUS operation.
577 #
579 
580  URL_TYPE_URL = 0
581  URL_TYPE_MD5 = 1
582 
583  # #constructor
584  # initialize fields
585  # @param siteId site identifier
586  # @param urlString the URL string according with HTTP spec.
587  #
def __init__(self, siteId, urlString):
    """Create a URL status request.

    @param siteId site identifier
    @param urlString value stored in the ``url`` field; the ``urlType`` field
                     is initialized to URL_TYPE_URL
    """
    super(URLStatus, self).__init__()
    self.siteId, self.url = siteId, urlString
    self.urlType = self.URL_TYPE_URL
593 
594 
595 
596 # #URLFetch event object
597 #
598 # The URLFetch event object for fetch URLs operation.
599 #
601 
602  DEFAULT_ALGORITHM = 0
603  PROPORTIONAL_ALGORITHM = 1
604  DEFAULT_LIMIT = 20
605 
606  DEFAULT_ORDER_BY_SITES = "Priority DESC, TcDate ASC"
607  DEFAULT_ORDER_BY_URLS = "CDate ASC"
608 
609  CRITERION_LIMIT = "LIMIT"
610  CRITERION_WHERE = "WHERE"
611  CRITERION_ORDER = "ORDER BY"
612  CRITERION_SQL = "SQL"
613 
614 
615  # #constructor
616  # initialize fields
617  # @param sitesList list of site's identifiers (MD5). If omitted or empty - all sites will take a part
618  # @param urlsCriterions dic of limit name and value. If omitted or empty - the "LIMIT" is set to DEFAULT_LIMIT
619  # @param sitesCriterions dic of limit name and value. If omitted or empty - the "LIMIT" is set to DEFAULT_LIMIT
620  # @param urlUpdate URLUpdate object, if is not None then used to update each URL record after select
621  # @param siteUpdate SiteUpdate object, if is not None then used to update site after select
622  #
623  def __init__(self, sitesList=None, urlsCriterions=None, sitesCriterions=None, urlUpdate=None, siteUpdate=None):
624  super(URLFetch, self).__init__()
625  # Init sites list
626  if sitesList is None:
627  sitesList = []
628  self.sitesList = sitesList
629  # Init criterions for Sites
630  if sitesCriterions is None:
631  sitesCriterions = {}
632  self.sitesCriterions = sitesCriterions
633  if self.CRITERION_ORDER not in sitesCriterions:
635  # Init criterions for URLs
636  if urlsCriterions is None:
637  urlsCriterions = {}
638  self.urlsCriterions = urlsCriterions
639  if self.CRITERION_LIMIT not in urlsCriterions:
641  if self.CRITERION_ORDER not in urlsCriterions:
643  self.urlUpdate = urlUpdate
644  self.maxURLs = self.DEFAULT_LIMIT
646  self.isLocking = True
648  self.siteUpdate = siteUpdate
649  self.attributeNames = ['*']
650 
651 
652 # #URLUpdate event object
653 #
654 # The URLUpdate event object for update operation. Updates only not None value fields
655 #
class URLUpdate(URL):
    """URL update request; per the file's own description only the fields
    that are not None are updated."""

    def __init__(self, siteId, urlString, urlType=URLStatus.URL_TYPE_URL, stateField=None, statusField=None,
                 normalizeMask=URL.URL_NORMALIZE_MASK, urlObject=None):
        """Create the update request.

        The object is always initialized through the URL constructor and then
        the listed fields are reset to None. When urlObject is a URL instance,
        its non-None fields are copied over that initial state afterwards.
        Previously that case skipped the initialization altogether, so no
        instance attribute existed, the hasattr() check failed for every
        field and nothing was copied.

        @param siteId site identifier
        @param urlString identifier of the URL, depends on urlType - HTTP URL or MD5(HTTP URL)
        @param urlType type of the urlString value, see URLStatus.URL_TYPE_URL
        @param stateField value for the ``state`` field
        @param statusField value for the ``status`` field
        @param normalizeMask normalization mask forwarded to the URL constructor
        @param urlObject optional URL object whose non-None fields are copied into this one
        """
        # Only a real URL string is handed to the parent, which normalizes
        # and hashes it; an MD5 identifier is stored by fillMD5() below.
        if urlType == URLStatus.URL_TYPE_URL:
            url = urlString
        else:
            url = None
        super(URLUpdate, self).__init__(siteId=siteId, url=url, state=stateField, normalizeMask=normalizeMask)

        self.siteId = siteId
        self.type = None
        self.state = stateField
        self.status = statusField
        for name in ("siteSelect", "crawled", "processed"):
            setattr(self, name, None)
        self.fillMD5(urlString, urlType)
        for name in ("contentType", "requestDelay", "processingDelay", "httpTimeout", "charset", "batchId",
                     "errorMask", "crawlingTime", "processingTime", "totalTime", "httpCode"):
            setattr(self, name, None)
        self.UDate = SQLExpression("NOW()")
        for name in ("CDate", "httpMethod", "size", "linksI", "linksE", "freq", "depth", "rawContentMd5",
                     "parentMd5", "lastModified", "eTag", "mRate", "mRateCounter", "tcDate", "maxURLsFromPage",
                     "priority", "tagsCount", "contentURLMd5", "tagsMask", "chainId", "classifierMask",
                     "attributes"):
            setattr(self, name, None)
        # Init criterions
        self.criterions = {}
        self.criterions[URLFetch.CRITERION_LIMIT] = 1

        if isinstance(urlObject, URL):
            # Init from URL object: overlay every field it has set.
            for name, value in urlObject.__dict__.items():
                if name.startswith("__") or value is None:
                    continue
                if hasattr(self, name):
                    setattr(self, name, value)

    def fillMD5(self, urlString, urlType):
        """Store an MD5-type identifier in the url/urlMd5 fields.

        For URLStatus.URL_TYPE_URL nothing is done here, because the parent
        constructor has already set both fields. For any other type the
        ``url`` field is cleared and urlString is stored as ``urlMd5``.

        @param urlString identifier of the URL
        @param urlType type of the identifier
        """
        if urlType != URLStatus.URL_TYPE_URL:
            self.url = None
            self.urlMd5 = urlString
743 
744 
745 
746 # #BatchItem object
747 #
748 # The BatchItem object for batch crawling tasks.
749 #
751 
752  PROP_FEED = "feed"
753 
754  # #constructor
755  # initialize fields
756  # @param siteId md5 string of site Id
757  # @param urlId md5 string of URL Id
758  # @param urlObj the URL object from source SQL db
759  #
def __init__(self, siteId, urlId, urlObj, urlPutObj=None, urlContentResponse=None, siteObj=None, depth=0):
    """Create a batch item.

    @param siteId md5 string of site Id
    @param urlId md5 string of URL Id
    @param urlObj the URL object from source SQL db
    @param urlPutObj object stored in ``urlPutObj`` (used for demo real-time mode)
    @param urlContentResponse object stored in ``urlContentResponse``; supports the demo
                              real-time mode algorithms (only crawling, only processing,
                              crawling and processing)
    @param siteObj object stored in ``siteObj``
    @param depth value stored in ``depth``
    """
    super(BatchItem, self).__init__()
    self.siteId, self.urlId, self.urlObj = siteId, urlId, urlObj
    self.properties = {}
    self.urlPutObj = urlPutObj
    self.urlContentResponse = urlContentResponse
    self.siteObj, self.depth = siteObj, depth
779 
780 
781 
782 # #Batch event object
783 #
784 # The Batch event object for crawling tasks.
785 #
787 
788  OPERATION_TYPE_NAME = "type"
789  TYPE_NORMAL_CRAWLER = 1
790  TYPE_INCR_CRAWLER = 2
791  TYPE_URLS_RETURN = 3
792  TYPE_REAL_TIME_CRAWLER = 4
793  TYPE_PURGE = 5
794  TYPE_PROCESS = 6
795  TYPE_AGE = 7
796 
797  DB_MODE_RW = 3
798  DB_MODE_R = 1
799  DB_MODE_W = 2
800  DB_MODE_NO = 0
801 
802  # #constructor
803  # initialize fields
804  # @param batchId - the batch Id the same as DRCE task Id
805  # @param batchItems list of BatchItem objects to process
806  #
807  def __init__(self, batchId, batchItems=None, crawlerType=None, dbMode=DB_MODE_RW, maxIterations=1, maxItems=None):
808  super(Batch, self).__init__()
809  self.id = batchId
810  if crawlerType is None:
811  crawlerType = Batch.TYPE_NORMAL_CRAWLER
812  self.crawlerType = crawlerType
813  if batchItems is None:
814  self.items = []
815  else:
816  self.items = batchItems
817  self.errorMask = APP_CONSTS.ERROR_OK
818  self.dbMode = dbMode
819  self.maxIterations = maxIterations
820  self.maxItems = maxItems
823 
824 
825 # #URLDelete event object
826 #
827 # The URLDelete event object for delete operation. Delete URL and all related data including content files,
828 # processed contents and URL registration
829 #
831 
832  REASON_USER_REQUEST = 0
833  REASON_AGING = 1
834  REASON_SITE_LIMITS = 2
835  REASON_SELECT_TO_CRAWL_TTL = 3
836  REASON_SELECT_TO_PROCESS_TTL = 4
837  REASON_RECRAWL = 5
838  REASON_CRAWLER_AUTOREMOVE = 6
839  REASON_SITE_UPDATE_ROOT_URLS = 7
840  REASON_RT_FINALIZER = 10
841  REASON_PROCESSOR_DUPLICATE = 11
842 
843 
844  # #constructor
845  # initialize fields
846  # @param urlString the identifier for URL, depends on urlType - HTTP URL or MD5(HTTP URL)
847  # @param urlType the type of url field value, see the URLStatus.URL_TYPE_URL definition
848  # @param criterions the sql query parts dict, see URLFetch for detailed description
def __init__(self, siteId, urlString, urlType=URLStatus.URL_TYPE_URL, criterions=None, reason=REASON_USER_REQUEST):
    """Create a URL delete request.

    @param siteId site identifier
    @param urlString the identifier for URL, depends on urlType - HTTP URL or MD5(HTTP URL)
    @param urlType the type of url field value, see the URLStatus.URL_TYPE_URL definition
    @param criterions the sql query parts dict, stored as given (None by default)
    @param reason one of the REASON_* constants
    """
    super(URLDelete, self).__init__()
    self.siteId, self.url, self.urlType = siteId, urlString, urlType
    self.criterions = criterions
    self.delayedType = NOT_DELAYED_OPERATION
    self.reason = reason
857 
858 
859 
860 # #URLCleanup event object
861 #
862 # The URLCleanup event object for cleanup operation. Also can updates only not None value fields
863 #
865 
866  # #constructor
867  # initialize fields
868  # @param urlString the identifier for URL, depends on urlType - HTTP URL or MD5(HTTP URL)
869  # @param urlType the type of url field value, see the URLStatus.URL_TYPE_URL definition
870  # @param stateField - the state field
871  # @param statusField - the status field
872  #
def __init__(self, siteId, urlString, urlType=URLStatus.URL_TYPE_URL, stateField=None, statusField=None,
             criterions=None):
    """Create a URL cleanup request.

    @param siteId site identifier
    @param urlString the identifier for URL, depends on urlType - HTTP URL or MD5(HTTP URL)
    @param urlType the type of url field value, see the URLStatus.URL_TYPE_URL definition
    @param stateField value stored in the ``state`` field
    @param statusField value stored in the ``status`` field
    @param criterions stored as given (None by default)
    """
    super(URLCleanup, self).__init__()
    self.siteId, self.url, self.urlType = siteId, urlString, urlType
    self.state, self.status = stateField, statusField
    self.criterions = criterions
    self.delayedType = NOT_DELAYED_OPERATION
883 
884 
885 
886 # #URLContentRequest event object
887 #
888 # The URLContentRequest event object get URL's content operation.
889 #
891 
892  CONTENT_TYPE_PROCESSED = 1 # Return one best value in custom format if exists or default internal if not
893  CONTENT_TYPE_RAW_LAST = 2
894  CONTENT_TYPE_RAW_FIRST = 4
895  CONTENT_TYPE_RAW_ALL = 8
896  CONTENT_TYPE_HEADERS = 16
897  CONTENT_TYPE_REQUESTS = 32
898  CONTENT_TYPE_META = 64
899  CONTENT_TYPE_COOKIES = 128
900  CONTENT_TYPE_TIDY = 256
901  CONTENT_TYPE_DYNAMIC = 512
902  CONTENT_TYPE_RAW = 1024
903  CONTENT_TYPE_CHAIN = 2048
904  CONTENT_TYPE_PROCESSED_INTERNAL = 4096 # Return internal format value(s) one or several
905  CONTENT_TYPE_PROCESSED_CUSTOM = 8192 # Return custom format value(s) one or several
906  CONTENT_TYPE_PROCESSED_ALL = 16384 # Return all values in addition to the CONTENT_TYPE_PROCESSED
907  CONTENT_TYPE_ATTRIBUTES = 32768 # Return attributes
908 
909  URL_TYPE_STRING = 0
910  URL_TYPE_MD5 = 1
911 
912  # #constructor
913  # initialize fields
914  # @param urlString the HTTP URL
915  # @param urlType the type of url field value, always URLStatus.URL_TYPE_URL
916  # @param contentTypeMask - the content types mask defines types of content that will be collected and returned
917  #
def __init__(self, siteId, urlString, contentTypeMask=CONTENT_TYPE_PROCESSED + CONTENT_TYPE_RAW_LAST,
             urlType=URL_TYPE_STRING):
    """Create a URL content request.

    @param siteId site identifier
    @param urlString the URL, or its MD5 when urlType is not URL_TYPE_STRING
    @param contentTypeMask the content types mask, a sum of CONTENT_TYPE_* bits
    @param urlType URL_TYPE_STRING to hash urlString into ``urlMd5``; any other
                   value stores urlString itself as ``urlMd5``
    """
    super(URLContentRequest, self).__init__()
    self.siteId = siteId
    self.url = urlString
    self.urlMd5 = self.fillMD5(urlString) if urlType == self.URL_TYPE_STRING else urlString
    self.contentTypeMask = contentTypeMask
    self.urlFetch = None
    self.attributeNames = ['*']

    # Field names paired with their default values; the list and the dict
    # below are both derived from this single table.
    fieldDefaults = [
        ("Status", 1),
        ("Crawled", 0),
        ("Processed", 0),
        ("ContentType", ""),
        ("Charset", ""),
        ("ErrorMask", 0),
        ("CrawlingTime", 0),
        ("ProcessingTime", 0),
        ("HTTPCode", 0),
        ("Size", 0),
        ("LinksI", 0),
        ("LinksE", 0),
        ("RawContentMd5", ""),
        ("LastModified", None),
        ("CDate", int(time.time())),
        ("UDate", None),
        ("TagsMask", 0),
        ("TagsCount", 0),
        ("PDate", None),
        ("ContentURLMd5", ""),
        ("Batch_Id", 0),
    ]
    self.dbFieldsList = [pair[0] for pair in fieldDefaults]
    self.dbFieldsListDefaultValues = dict(fieldDefaults)
955 
956 
957  # #Method calculates the MD5 hex digest of the url string
958  #
959  # urlString - url
def fillMD5(self, urlString):
    """Return the MD5 hex digest of a URL string.

    Text (unicode) input is encoded as UTF-8 before hashing, so URLs with
    non-ASCII characters no longer fail inside hashlib; byte strings are
    hashed unchanged. No instance field is modified.

    @param urlString the URL, as a byte string or as text
    @return the hexadecimal MD5 digest
    """
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    if isinstance(urlString, type(u"")):
        urlString = urlString.encode("utf-8")
    return hashlib.md5(urlString).hexdigest()
962 
963 
964 
965 # #Content object
966 #
967 # The Content object represents content data for URLContentResponse event object.
968 #
970 
971  CONTENT_RAW_CONTENT = 0
972  CONTENT_TIDY_CONTENT = 1
973  CONTENT_HEADERS_CONTENT = 2
974  CONTENT_REQUESTS_CONTENT = 3
975  CONTENT_META_CONTENT = 4
976  CONTENT_COOKIES_CONTENT = 5
977  CONTENT_DYNAMIC_CONTENT = 9
978  CONTENT_PROCESSOR_CONTENT = 10
979  CONTENT_CHAIN_PARTS = 11
980 
981  # #constructor
982  # initialize fields
983  # @param contentBuffer the data buffer
984  # @param cDate the content creation timestamp or zero if not defined
985  #
def __init__(self, contentBuffer, cDate=0, typeId=CONTENT_RAW_CONTENT):
    """Create a content object.

    @param contentBuffer the data buffer
    @param cDate the content creation timestamp or zero if not defined
    @param typeId the contents type, one of the CONTENT_* constants
    """
    super(Content, self).__init__()
    self.buffer, self.cDate, self.typeId = contentBuffer, cDate, typeId
994 
995 
996 
# #URLContentResponse event object
#
# The URLContentResponse event object is the response to the get URL's content operation.
#

STATUS_OK = 0
STATUS_URL_NOT_FOUND = 1
STATUS_RAW_CONTENT_NOT_FOUND = 2
STATUS_PROCESSED_CONTENT_NOT_FOUND = 3

# #constructor
# initialize fields
# @param url the HTTP URL of crawled resource
# @param rawContents the list of Content objects for raw crawled files; a new empty list is used when None
# @param processedContents the list of Content objects for processed contents, depends on the CONTENT_TYPE_PROCESSED,
#                          CONTENT_TYPE_PROCESSED_INTERNAL, CONTENT_TYPE_PROCESSED_CUSTOM and
#                          CONTENT_TYPE_PROCESSED_ALL bits. If both the CONTENT_TYPE_PROCESSED_INTERNAL and
#                          CONTENT_TYPE_PROCESSED_CUSTOM are set - list item is a tuple with Content objects in
#                          internal and custom formats: [(ContentObjInternal, ContentObjCustom), ...].
#                          If only CONTENT_TYPE_PROCESSED is set, the custom format is returned if it exists and
#                          the internal one if not. A new empty list is used when None
# @param status the sql db field from `dc_urls`.`urls_SITE_ID_MD5`.`status`
#
def __init__(self, url, rawContents=None, processedContents=None, status=STATUS_OK):
  super(URLContentResponse, self).__init__()
  # Init url
  self.url = url
  # Init status
  self.status = status
  # Init raw contents list; each instance gets its own list when none is supplied
  self.rawContents = [] if rawContents is None else rawContents
  # Init processed contents list; the attribute is always defined, even when the argument is omitted
  self.processedContents = [] if processedContents is None else processedContents
  # Additional content elements (lists)
  self.headers = []
  self.requests = []
  self.meta = []
  self.cookies = []
  self.urlMd5 = None
  # Single assignment: the empty string is the value the object ends up with after construction
  self.rawContentMd5 = ""
  self.dbFields = {}
  self.siteId = 0
  self.contentURLMd5 = ""
  self.itemProperties = None
  self.attributes = []
1048 
1049 
1050 
# #ClientResponse event object
#
# The ClientResponse event object is used to respond to any client request.
#

STATUS_OK = 0
STATUS_ERROR_NONE = 1
STATUS_ERROR_EMPTY_LIST = 2

# #constructor
# initialize fields
# @param itemsList the list of ClientResponseItem objects; a new empty list is used when None
# @param errorCode the general error code, if all item objects are okay it is okay
# @param errorMessage the united error message
#
def __init__(self, itemsList=None, errorCode=STATUS_OK, errorMessage=""):
  super(ClientResponse, self).__init__()
  # Items list, never left as None
  self.itemsList = itemsList if itemsList is not None else []
  # Error code and message are stored as given
  self.errorCode = errorCode
  self.errorMessage = errorMessage
1076 
1077 
1078 
1080 
STATUS_OK = 0
STATUS_ERROR_RESTORE_OBJECT = 1
STATUS_ERROR_DRCE = 2
# One message constant per error status; the DRCE text has its own name so it no longer
# overwrites the restore-object message
MSG_ERROR_RESTORE_OBJECT = "Object restore error!"
MSG_ERROR_DRCE = "DRCE error!"

# #constructor
# initialize fields
# @param itemObject the item object from the DRCE response
#
def __init__(self, itemObject):
  super(ClientResponseItem, self).__init__()
  self.itemObject = itemObject
  # A new item starts in the OK state with an empty message
  self.errorCode = self.STATUS_OK
  self.errorMessage = ""
  # Remaining fields start at neutral values (0 or empty string)
  self.id = 0
  self.host = ""
  self.port = 0
  self.node = ""
  self.time = 0
1101 
1102 
1103 
1105 
CRITERION_LIMIT = "LIMIT"
CRITERION_WHERE = "WHERE"
CRITERION_ORDER = "ORDER BY"

MAX_URLS_TO_DELETE_FROM_SITE = 100
ALL_SITES = -1

# #constructor
# If the siteId is None the siteLimits must be a tuple: item zero is the offset and item one is the number of sites
# to purge, for example (0, 100) - means process 100 sites starting from zero offset of the sites list
# @param siteId the site identifier
# @param urlString the identifier for URL, depends on urlType - HTTP URL or MD5(HTTP URL)
# @param urlType the type of url field value, see the URLStatus.URL_TYPE_URL definition
# @param criterions the sql query parts dict, see URLFetch for detailed description, for URLs;
#                   when None a dict holding only the LIMIT of MAX_URLS_TO_DELETE_FROM_SITE is used
def __init__(self, siteId, urlString, urlType=URLStatus.URL_TYPE_URL, criterions=None):
  super(URLPurge, self).__init__()
  self.siteId = siteId
  self.url = urlString
  self.urlType = urlType
  # Fall back to the default limit-only criterions
  self.criterions = (criterions if criterions is not None
                     else {self.CRITERION_LIMIT: self.MAX_URLS_TO_DELETE_FROM_SITE})
  # Not a constructor argument; starts as None
  self.siteLimits = None
1128 
1129 
1130 
1132 
FULL_RECALC = 0
PARTITION_RECALC = 1

# #constructor
# initialize fields, all arguments are stored as given
# @param siteId the site identifier
# @param recalcType the recalculation type, FULL_RECALC by default
# @param criterions the criterions, kept as None when not supplied
def __init__(self, siteId, recalcType=FULL_RECALC, criterions=None):
  super(FieldRecalculatorObj, self).__init__()
  self.siteId, self.recalcType, self.criterions = siteId, recalcType, criterions
1142 
1143 
1144 
1146 
# #constructor
# initialize fields, all arguments are stored as given
# @param siteId the site identifier
# @param urlString the url identifier, stored in the `url` field
# @param dbName the database name
# @param urlType the type of url field value, URLStatus.URL_TYPE_URL by default
# @param criterions the criterions, kept as None when not supplied
def __init__(self, siteId, urlString, dbName, urlType=URLStatus.URL_TYPE_URL, criterions=None):
  super(URLVerify, self).__init__()
  self.siteId, self.url, self.dbName = siteId, urlString, dbName
  self.urlType, self.criterions = urlType, criterions
1155 
1156 
1157 
# #URLAge event object used to make a request of the URLAge operation
#
# The URLAge operation object performs deletion of URLs by aging condition
#

CRITERION_LIMIT = "LIMIT"
CRITERION_WHERE = "WHERE"
CRITERION_ORDER = "ORDER BY"

MAX_URLS_TO_DELETE_FROM_SITE = 100
MAX_SITES_TO_SELECT = 10
DEFAULT_LIMIT = 100

# #constructor
# @param urlsCriterions - criterions for fetching urls from MySQL db (dc_urls db);
#                         when None a LIMIT of MAX_URLS_TO_DELETE_FROM_SITE is used
# @param sitesCriterions - criterions for fetching sites from MySQL db (db_sites db);
#                          when None a LIMIT of MAX_SITES_TO_SELECT is used
def __init__(self, urlsCriterions=None, sitesCriterions=None):
  super(URLAge, self).__init__()
  self.urlsCriterions = (urlsCriterions if urlsCriterions is not None
                         else {self.CRITERION_LIMIT: self.MAX_URLS_TO_DELETE_FROM_SITE})
  self.sitesCriterions = (sitesCriterions if sitesCriterions is not None
                          else {self.CRITERION_LIMIT: self.MAX_SITES_TO_SELECT})
  # Module-level constant; marks the operation as not delayed
  self.delayedType = NOT_DELAYED_OPERATION
1185 
1186 
1187 
# #DataFetchRequest incoming (request) class for db storage request
#
#

# #constructor
# All arguments are stored as given
# @param siteId - siteId
# @param urlMd5 - url's md5
# @param criterions - additional SQL criterions, kept as None when not supplied
def __init__(self, siteId, urlMd5, criterions=None):
  super(DataFetchRequest, self).__init__()
  self.siteId, self.urlMd5, self.criterions = siteId, urlMd5, criterions
1202 
1203 
1204 
# #DataFetchResponse outgoing (response) class for db storage request
#
#

# #constructor
# All arguments are stored as given
# @param resultDict - db storage result fields
# @param errCode - operation error code, 0 by default
# @param errMessage - operation error message, empty by default
def __init__(self, resultDict, errCode=0, errMessage=""):
  super(DataFetchResponse, self).__init__()
  self.resultDict, self.errCode, self.errMessage = resultDict, errCode, errMessage
1219 
1220 
1221 
# #DataDeleteRequest incoming (request) class for db storage [Data delete operations]
#
#

# #constructor
# All arguments are stored as given
# @param siteId - siteId
# @param urlMd5 - url's md5
# @param filesSuffix - suffix of storage file
def __init__(self, siteId, urlMd5, filesSuffix):
  super(DataDeleteRequest, self).__init__()
  # Field order differs from the argument order: siteId, filesSuffix, urlMd5
  self.siteId, self.filesSuffix, self.urlMd5 = siteId, filesSuffix, urlMd5
1236 
1237 
1238 
# #DataDeleteResponse outgoing (response) class for db storage [Data delete operations]
#
#

# #constructor
# Both arguments are stored as given
# @param errCode - operation error code, 0 by default
# @param errMessage - operation error message, empty by default
def __init__(self, errCode=0, errMessage=""):
  super(DataDeleteResponse, self).__init__()
  self.errCode, self.errMessage = errCode, errMessage
1251 
1252 
1253 
# #DataCreateRequest incoming (request) class for db storage
#
#

# #constructor
# All arguments are stored as given
# @param siteId - siteId
# @param urlMd5 - url's md5
# @param filesSuffix - suffix of storage file
def __init__(self, siteId, urlMd5, filesSuffix):
  super(DataCreateRequest, self).__init__()
  # Field order differs from the argument order: siteId, filesSuffix, urlMd5
  self.siteId, self.filesSuffix, self.urlMd5 = siteId, filesSuffix, urlMd5
1268 
1269 
1270 
# #DataCreateResponse outgoing (response) class for db storage
#
#

# #constructor
# Both arguments are stored as given
# @param errCode - operation error code, 0 by default
# @param errMessage - operation error message, empty by default
def __init__(self, errCode=0, errMessage=""):
  super(DataCreateResponse, self).__init__()
  self.errCode, self.errMessage = errCode, errMessage
1283 
1284 
1285 
# #URLPut incoming (request) class for db storage
#
#

# #constructor
# @param siteId - siteId
# @param urlMd5 - url's md5
# @param contentType - content type value, stored as given
# @param putDict - dict stored in the `putDict` field; a new empty dict is used when None
# @param criterions - criterions for urlMd5's fetching (if urlMd5 is None)
# @param fileStorageSuffix - stored as given, None by default
def __init__(self, siteId, urlMd5, contentType, putDict=None, criterions=None, fileStorageSuffix=None):
  super(URLPut, self).__init__()
  self.siteId = siteId
  self.urlMd5 = urlMd5
  # Each instance gets its own dict when the caller supplies none
  self.putDict = putDict if putDict is not None else {}
  self.criterions = criterions
  self.contentType = contentType
  self.fileStorageSuffix = fileStorageSuffix
1306 
1307 
1308 
# #URLPutResponse outgoing (response) class for db storage
#
#

# #constructor
# All arguments are stored as given
# @param contentType - content type value
# @param errCode - operation error code, 0 by default
# @param errMessage - operation error message, empty by default
def __init__(self, contentType, errCode=0, errMessage=""):
  super(URLPutResponse, self).__init__()
  self.contentType, self.errCode, self.errMessage = contentType, errCode, errMessage
1322 
1323 
1324 
# #URLHistoryRequest event object
#
# The URLHistoryRequest event object for the get URL's history operation.
#

CRITERION_LIMIT = "LIMIT"
CRITERION_WHERE = "WHERE"
CRITERION_ORDER = "ORDER BY"
DEFAULT_ORDER = "ODate ASC"
DEFAULT_WHERE = "URLMD5='%URL%'"
DEFAULT_LIMIT = 100

# #constructor
# initialize fields
# @param siteId the Site Id
# @param urlMd5 the md5(url) or None to avoid selection for one URL only
# @param urlCriterions the url criterions dict for the SQL request; when supplied without a LIMIT key the
#                      DEFAULT_LIMIT (as a string) is added to the supplied dict; when None an empty dict is used
# @param logCriterions the log criterions dict for the SQL request; when supplied without an ORDER BY key the
#                      DEFAULT_ORDER is added to the supplied dict; when None an empty dict is used
#
def __init__(self, siteId, urlMd5=None, urlCriterions=None, logCriterions=None):
  super(URLHistoryRequest, self).__init__()
  self.siteId = siteId
  self.urlMd5 = urlMd5
  if urlCriterions is not None:
    self.urlCriterions = urlCriterions
    # Defaults are applied only to caller-supplied criterions
    if self.CRITERION_LIMIT not in self.urlCriterions:
      self.urlCriterions[self.CRITERION_LIMIT] = str(self.DEFAULT_LIMIT)
  else:
    self.urlCriterions = {}
  if logCriterions is not None:
    self.logCriterions = logCriterions
    if self.CRITERION_ORDER not in self.logCriterions:
      self.logCriterions[self.CRITERION_ORDER] = self.DEFAULT_ORDER
  else:
    self.logCriterions = {}
1360 
1361 
1362 
# #URLHistoryResponse event object
#
# The URLHistoryResponse event object represents the response of the URL's history operation.

# #constructor
# initialize fields
# @param logRows the list of items of selected rows from the `dc_stat_logs`.`log_SITE_ID` table as is;
#                a new empty list is used when None
# @param siteId the site identifier, None by default
#
def __init__(self, logRows=None, siteId=None):
  super(URLHistoryResponse, self).__init__()
  self.siteId = siteId
  self.logRows = logRows if logRows is not None else []
1380 
1381 
1382 
# #URLStatsRequest event object
#
# The URLStatsRequest event object for the get URL's stats operation.
#

CRITERION_LIMIT = "LIMIT"
CRITERION_WHERE = "WHERE"
CRITERION_ORDER = "ORDER BY"
DEFAULT_ORDER = "ODate ASC"
DEFAULT_WHERE = "URLMD5='%URL%'"
DEFAULT_LIMIT = 100

# #constructor
# initialize fields
# @param siteId the Site Id
# @param urlMd5 the md5(url) or None to avoid selection for one URL only
# @param urlCriterions the url criterions dict for the SQL request; when supplied without a LIMIT key the
#                      DEFAULT_LIMIT (as a string) is added to the supplied dict; when None an empty dict is used
# @param statsCriterions the stats criterions dict for the SQL request; when None an empty dict is used
#
def __init__(self, siteId, urlMd5=None, urlCriterions=None, statsCriterions=None):
  super(URLStatsRequest, self).__init__()
  self.siteId = siteId
  self.urlMd5 = urlMd5
  if urlCriterions is None:
    self.urlCriterions = {}
  else:
    self.urlCriterions = urlCriterions
  # The default limit is added only to caller-supplied criterions
  if self.CRITERION_LIMIT not in self.urlCriterions and urlCriterions is not None:
    self.urlCriterions[self.CRITERION_LIMIT] = str(self.DEFAULT_LIMIT)
  # The None branch needs a body: default to an empty dict, the same way urlCriterions does
  if statsCriterions is None:
    self.statsCriterions = {}
  else:
    self.statsCriterions = statsCriterions
1416 
1417 
1418 
# #URLStatsResponse event object
#
# The URLStatsResponse event object represents the response of the URL's stats operation.

# #constructor
# initialize fields
# @param freqRows the list of items of selected rows from the `dc_stat_freq`.`freq_SITE_ID` table as is;
#                 a new empty list is used when None
# @param siteId the site identifier, None by default
#
def __init__(self, freqRows=None, siteId=None):
  super(URLStatsResponse, self).__init__()
  self.siteId = siteId
  self.freqRows = freqRows if freqRows is not None else []
1435 
1436 
# #Proxy event object
#
# The Proxy event object represents sites_proxy table element

# #constructor
# Only siteId and host come from the caller, every other field starts at a fixed initial value
# @param siteId the site identifier
# @param host the proxy host
def __init__(self, siteId, host):
  super(Proxy, self).__init__()
  # Identity
  self.id = 0
  self.siteId = siteId
  self.host = host
  self.domains = None
  self.priority = 0
  self.state = 0
  # Location fields
  self.countryCode = ""
  self.countryName = ""
  self.regionCode = 0
  self.regionName = ""
  self.cityName = ""
  self.zipCode = ""
  self.timeZone = ""
  self.latitude = 0.0
  self.longitude = 0.0
  self.metroCode = 0
  # Fault counters
  self.faults = 0
  self.faultsMax = 0
  self.categoryId = 0
  self.limits = None
  self.description = ""
  # Creation date is the current time in whole seconds; update date is an SQL NOW() expression
  self.cDate = int(time.time())
  self.uDate = SQLExpression("NOW()")
1468 
1469 
# #ProxyUpdate event object
#
# The ProxyUpdate event object which updates fields in Proxy table

# #constructor
# The parent constructor receives siteId and host; afterwards every field is assigned here explicitly
# @param siteId the site identifier
# @param host the proxy host
def __init__(self, siteId, host):
  super(ProxyUpdate, self).__init__(siteId, host)
  # Identity
  self.id = 0
  self.siteId = siteId
  self.host = host
  self.domains = None
  self.priority = 0
  self.state = 0
  # Location fields
  self.countryCode = ""
  self.countryName = ""
  self.regionCode = 0
  self.regionName = ""
  self.cityName = ""
  self.zipCode = ""
  self.timeZone = ""
  self.latitude = 0.0
  self.longitude = 0.0
  self.metroCode = 0
  # Fault counters
  self.faults = 0
  self.faultsMax = 0
  self.categoryId = 0
  self.limits = None
  self.description = ""
  # Creation date is the current time in whole seconds; update date is an SQL NOW() expression
  self.cDate = int(time.time())
  self.uDate = SQLExpression("NOW()")
1501 
1502 
# #ProxyDelete event object
#
# The ProxyDelete event object which deletes fields in Proxy table

# #constructor
# @param siteId the site identifier, None by default
# @param host the proxy host, None by default
# @param criterions the criterions dict; a new empty dict is used when None. When siteId is given and the dict
#                   has no WHERE item, a WHERE condition on the `Site_Id` column is added to it
def __init__(self, siteId=None, host=None, criterions=None):
  super(ProxyDelete, self).__init__()
  self.siteId = siteId
  self.host = host
  if criterions is not None:
    self.criterions = criterions
  else:
    self.criterions = {}
  if self.siteId is not None and URLFetch.CRITERION_WHERE not in self.criterions:
    # Backticks must enclose the column name only; the value is compared as a quoted literal.
    # NOTE(review): siteId is embedded without escaping - presumably always a trusted id; verify against callers
    self.criterions[URLFetch.CRITERION_WHERE] = "`Site_Id`='" + str(self.siteId) + "'"
1519 
1520 
# #ProxyStatus event object
#
# The ProxyStatus event object, request object selecting Proxy table elements by siteId, host and criterions
# NOTE(review): the previous header repeated the ProxyDelete description - confirm the exact operation semantics

# #constructor
# @param siteId the site identifier, None by default
# @param host the proxy host, None by default
# @param criterions the criterions dict; a new empty dict is used when None. When siteId is given and the dict
#                   has no WHERE item, a WHERE condition on the `Site_Id` column is added to it
def __init__(self, siteId=None, host=None, criterions=None):
  super(ProxyStatus, self).__init__()
  self.siteId = siteId
  self.host = host
  if criterions is not None:
    self.criterions = criterions
  else:
    self.criterions = {}
  if self.siteId is not None and URLFetch.CRITERION_WHERE not in self.criterions:
    # Backticks must enclose the column name only; the value is compared as a quoted literal.
    # NOTE(review): siteId is embedded without escaping - presumably always a trusted id; verify against callers
    self.criterions[URLFetch.CRITERION_WHERE] = "`Site_Id`='" + str(self.siteId) + "'"
1537 
1538 
1539 
# #ProxyFind event object
#
# The ProxyFind event object which finds and returns proxies by criterions

# #constructor
# @param siteId the site identifier, None by default
# @param criterions the criterions dict; a new empty dict is used when None. When siteId is given and the dict
#                   has no WHERE item, a WHERE condition on the `Site_Id` column is added to it
# @param siteCriterions the site criterions dict; a new empty dict is used when None
def __init__(self, siteId=None, criterions=None, siteCriterions=None):
  super(ProxyFind, self).__init__()
  self.siteId = siteId
  if criterions is not None:
    self.criterions = criterions
  else:
    self.criterions = {}
  if siteCriterions is not None:
    self.siteCriterions = siteCriterions
  else:
    self.siteCriterions = {}
  if self.siteId is not None and URLFetch.CRITERION_WHERE not in self.criterions:
    # Backticks must enclose the column name only; the value is compared as a quoted literal.
    # NOTE(review): siteId is embedded without escaping - presumably always a trusted id; verify against callers
    self.criterions[URLFetch.CRITERION_WHERE] = "`Site_Id`='" + str(self.siteId) + "'"
1559 
1560 
1561 
# #Attribute event object
#
# The Attribute event object for operations that use Attributes.
#

# #constructor
# initialize fields, all arguments are stored as given
# @param siteId site identifier
# @param name attribute's name
# @param urlMd5 url's md5, empty string by default
# @param value attribute's value, empty string by default
# @param cDate creation date, None by default
#
def __init__(self, siteId, name, urlMd5='', value='', cDate=None):
  super(Attribute, self).__init__()
  self.siteId, self.name = siteId, name
  self.urlMd5, self.value, self.cDate = urlMd5, value, cDate
1580 
1581 
# #AttributeUpdate event object
#
# The AttributeUpdate event object for update operation. Updates only not None value fields
#

# #constructor
# initialize fields; the parent constructor receives siteId and name, then the remaining fields are reset to None
# @param siteId site identifier
# @param name attribute's name
#
def __init__(self, siteId, name):
  super(AttributeUpdate, self).__init__(siteId, name)
  self.siteId, self.name = siteId, name
  # None marks a field as not set
  self.urlMd5 = self.value = self.cDate = None
1600 
1601 
# #AttributeDelete event object
#
# The AttributeDelete event object for delete operation. Deletes attributes by name or by criterions
#

# #constructor
# initialize fields
# @param siteId site identifier
# @param name attribute's name, None by default
# @param criterions the criterions dict; a new empty dict is used when None
#
def __init__(self, siteId, name=None, criterions=None):
  super(AttributeDelete, self).__init__()
  self.siteId = siteId
  self.name = name
  self.criterions = {} if criterions is None else criterions
1621 
1622 
1623 
# #AttributeFetch event object
#
# The AttributeFetch event object for fetch operation. Fetches attributes by name or by criterions
#

# #constructor
# initialize fields
# @param siteId site identifier
# @param name attribute's name, None by default
# @param criterions the criterions dict; a new empty dict is used when None
#
def __init__(self, siteId, name=None, criterions=None):
  super(AttributeFetch, self).__init__()
  self.siteId = siteId
  self.name = name
  self.criterions = {} if criterions is None else criterions
def getFromProperties(prop, keyName, fieldName="value")
def __init__(self, siteId, urlMd5=None, urlCriterions=None, statsCriterions=None)
string CONTENT_TYPE_UNDEFINED
def __init__(self, url, criterions=None)
def __init__(self, siteId, urlMd5, filesSuffix)
def __init__(self, siteId, urlMd5, contentType, putDict=None, criterions=None, fileStorageSuffix=None)
def __init__(self, contentBuffer, cDate=0, typeId=CONTENT_RAW_CONTENT)
def __init__(self, siteId, name, urlMd5='', value='', cDate=None)
def __init__(self, siteId, urlString, contentTypeMask=CONTENT_TYPE_PROCESSED+CONTENT_TYPE_RAW_LAST, urlType=URL_TYPE_STRING)
def __init__(self, batchId, batchItems=None, crawlerType=None, dbMode=DB_MODE_RW, maxIterations=1, maxItems=None)
def __init__(self, siteId, name=None, criterions=None)
def __init__(self, siteId=None, criterions=None, siteCriterions=None)
def __init__(self, url, _userId=0)
Definition: EventObjects.py:58
def __init__(self, errCode=0, errMessage="")
def __init__(self, urlsCriterions=None, sitesCriterions=None)
def __init__(self, resultDict, errCode=0, errMessage="")
def __init__(self, siteId, name=None, criterions=None)
def __init__(self, siteId, urlId, urlObj, urlPutObj=None, urlContentResponse=None, siteObj=None, depth=0)
def __init__(self, siteId, urlString, urlType=URLStatus.URL_TYPE_URL, stateField=None, statusField=None, criterions=None)
def __init__(self, siteId, updateType=UPDATE_TYPE_APPEND)
def __init__(self, siteId=None, taskType=TASK_TYPE_SYNC, criterions=None)
def __init__(self, siteId, taskType=TASK_TYPE_SYNC)
def __init__(self, siteId, urlString, urlType=URLStatus.URL_TYPE_URL, criterions=None)
def __init__(self, siteId, urlMd5, filesSuffix)
def isInProperties(prop, keyName)
def fillMD5(self, urlString, urlType)
def __init__(self, siteId, url, state=STATE_ENABLED, urlUpdate=None, normalizeMask=URL_NORMALIZE_MASK)
def __init__(self, siteId, urlMd5, criterions=None)
def __init__(self, siteId, recalcType=FULL_RECALC, criterions=None)
def __init__(self, url, rawContents=None, processedContents=None, status=STATUS_OK)
def __init__(self, siteId, deleteTaskId=None)
def __init__(self, siteId, pattern, ptype=TYPE_INCLUDE, pmode=TYPE_URL, pstate=TYPE_ENABLED)
def __init__(self, siteId, host)
def __init__(self, itemObject)
def fillMD5(self, urlString)
def __init__(self, siteId, urlString)
def __init__(self, logRows=None, siteId=None)
def __init__(self, siteId, urlString, dbName, urlType=URLStatus.URL_TYPE_URL, criterions=None)
def __init__(self, siteId=None, host=None, criterions=None)
def __init__(self, contentType, errCode=0, errMessage="")
def __init__(self, sitesList=None, urlsCriterions=None, sitesCriterions=None, urlUpdate=None, siteUpdate=None)
def __init__(self, siteId=None, host=None, criterions=None)
def __init__(self, itemsList=None, errorCode=STATUS_OK, errorMessage="")
def __init__(self, siteId, host)
def __init__(self, freqRows=None, siteId=None)
def getURL(self, normalizeMask=URL_NORMALIZE_MASK)
def rewriteFields(self, siteObj, addListFields=True)
def __init__(self, siteId, url, stateField=None, normalizeMask=URL.URL_NORMALIZE_MASK)
def __init__(self, siteId, name)
def __init__(self, siteId, urlString, urlType=URLStatus.URL_TYPE_URL, criterions=None, reason=REASON_USER_REQUEST)
def __init__(self, errCode=0, errMessage="")
def __init__(self, siteId, urlString, urlType=URLStatus.URL_TYPE_URL, stateField=None, statusField=None, normalizeMask=URL.URL_NORMALIZE_MASK, urlObject=None)
def __init__(self, siteId, urlMd5=None, urlCriterions=None, logCriterions=None)