HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc.EventObjects.URL Class Reference
Inheritance diagram for dc.EventObjects.URL: app.Utils.JsonSerializable ← dc.EventObjects.URL (diagram image not included in this text version)
Collaboration diagram for dc.EventObjects.URL:

Public Member Functions

def __init__ (self, siteId, url, state=STATE_ENABLED, urlUpdate=None, normalizeMask=URL_NORMALIZE_MASK)
 
def getURL (self, normalizeMask=URL_NORMALIZE_MASK)
 
- Public Member Functions inherited from app.Utils.JsonSerializable
def __init__ (self)
 
def toJSON (self)
 

Public Attributes

 siteId
 
 url
 
 type
 
 state
 
 status
 
 siteSelect
 
 crawled
 
 processed
 
 urlMd5
 
 contentType
 
 requestDelay
 
 processingDelay
 
 httpTimeout
 
 charset
 
 batchId
 
 errorMask
 
 crawlingTime
 
 processingTime
 
 totalTime
 
 httpCode
 
 UDate
 
 CDate
 
 httpMethod
 
 size
 
 linksI
 
 linksE
 
 freq
 
 depth
 
 rawContentMd5
 
 parentMd5
 
 lastModified
 
 eTag
 
 mRate
 
 mRateCounter
 
 tcDate
 
 maxURLsFromPage
 
 contentMask
 
 tagsMask
 
 tagsCount
 
 pDate
 
 contentURLMd5
 
 priority
 
 urlUpdate
 
 urlPut
 
 chainId
 
 classifierMask
 
 attributes
 

Static Public Attributes

int STATE_ENABLED = 0
 
int STATE_DISABLED = 1
 
int STATE_ERROR = 2
 
int STATUS_UNDEFINED = 0
 
int STATUS_NEW = 1
 
int STATUS_SELECTED_CRAWLING = 2
 
int STATUS_CRAWLING = 3
 
int STATUS_CRAWLED = 4
 
int STATUS_SELECTED_PROCESSING = 5
 
int STATUS_PROCESSING = 6
 
int STATUS_PROCESSED = 7
 
int STATUS_SELECTED_CRAWLING_INCREMENTAL = 8
 
int CONTENT_EMPTY = 0
 
int CONTENT_STORED_ON_DISK = 1 << 0
 
int TYPE_REGULAR = 0
 
int TYPE_SINGLE = 1
 
int TYPE_REGULAR_EXT = 2
 
int TYPE_NEW_SITE = 3
 
int TYPE_FETCHED = 4
 
int TYPE_REAL_TIME_CRAWLER = 5
 
int TYPE_CHAIN = 6
 
int SITE_SELECT_TYPE_EXPLICIT = 0
 
int SITE_SELECT_TYPE_AUTO = 1
 
int SITE_SELECT_TYPE_QUALIFY_URL = 2
 
int SITE_SELECT_TYPE_NONE = 3
 
string CONTENT_TYPE_TEXT_HTML = "text/html"
 
string CONTENT_TYPE_UNDEFINED = ""
 
 URL_NORMALIZE_MASK = UrlNormalizator.NORM_DEFAULT
 

Additional Inherited Members

- Static Public Member Functions inherited from app.Utils.JsonSerializable
def json_serial (obj)
 

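Detailed Description

URL event object. The constructor stores siteId, normalizes url through UrlNormalizator (when url is not None and normalizeMask is not UrlNormalizator.NORM_NONE), records the MD5 hex digest of the normalized URL in urlMd5 (None when url is None), and sets every other attribute to its default value. The class inherits from app.Utils.JsonSerializable.

Definition at line 434 of file EventObjects.py.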

Constructor & Destructor Documentation

◆ __init__()

def dc.EventObjects.URL.__init__ (   self,
  siteId,
  url,
  state = STATE_ENABLED,
  urlUpdate = None,
  normalizeMask = URL_NORMALIZE_MASK 
)

Definition at line 490 of file EventObjects.py.

490  def __init__(self, siteId, url, state=STATE_ENABLED, urlUpdate=None, normalizeMask=URL_NORMALIZE_MASK):
491  super(URL, self).__init__()
492 
493  self.siteId = siteId
494  self.url = url
495  if url is not None:
496  # normalize url according to RFC 3986
497  self.url = self.getURL(normalizeMask)
498  self.type = self.TYPE_REGULAR
499  self.state = state
500  self.status = self.STATUS_NEW
501  self.siteSelect = self.SITE_SELECT_TYPE_NONE
502  self.crawled = 0
503  self.processed = 0
504  if url is not None:
505  self.urlMd5 = hashlib.md5(self.url).hexdigest()
506  else:
507  self.urlMd5 = None
508  self.contentType = self.CONTENT_TYPE_UNDEFINED
509  self.requestDelay = 500
510  self.processingDelay = 500
511  self.httpTimeout = 30000
512  self.charset = ""
513  self.batchId = 0
514  self.errorMask = 0
515  self.crawlingTime = 0
516  self.processingTime = 0
517  self.totalTime = 0
518  self.httpCode = 0
519  self.UDate = None
520  self.CDate = None
521  self.httpMethod = "get"
522  self.size = 0
523  self.linksI = 0
524  self.linksE = 0
525  self.freq = 0
526  self.depth = 0
527  self.rawContentMd5 = ""
528  self.parentMd5 = ""
529  self.lastModified = None
530  self.eTag = ""
531  self.mRate = 0.0
532  self.mRateCounter = 0
533  self.tcDate = None
534  self.maxURLsFromPage = 100
535  self.contentMask = self.CONTENT_EMPTY
536  self.tagsMask = 0
537  self.tagsCount = 0
538  self.pDate = None
539  self.contentURLMd5 = ""
540  self.priority = 0
541  self.urlUpdate = urlUpdate
542  self.urlPut = None
543  self.chainId = None
544  self.classifierMask = 0
545  self.attributes = []
546 
547 
def __init__(self)
constructor
Definition: UIDGenerator.py:19

Member Function Documentation

◆ getURL()

def dc.EventObjects.URL.getURL (   self,
  normalizeMask = URL_NORMALIZE_MASK 
)

Definition at line 552 of file EventObjects.py.

552  def getURL(self, normalizeMask=URL_NORMALIZE_MASK):
553  url = self.url
554  if normalizeMask != UrlNormalizator.NORM_NONE:
555  url = UrlNormalizator.normalize(self.url, None, normalizeMask)
556 
557  return url
558 
559 
560 
561 # #SiteURL event object
562 #
563 # The SiteURL event object for operations uses sites_urls table.
564 #
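Here is the caller graph for this function: (graph image not included in this text version; the only caller visible on this page is dc.EventObjects.URL.__init__(), at line 497 of EventObjects.py)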

Member Data Documentation

◆ attributes

dc.EventObjects.URL.attributes

Definition at line 545 of file EventObjects.py.

◆ batchId

dc.EventObjects.URL.batchId

Definition at line 513 of file EventObjects.py.

◆ CDate

dc.EventObjects.URL.CDate

Definition at line 520 of file EventObjects.py.

◆ chainId

dc.EventObjects.URL.chainId

Definition at line 543 of file EventObjects.py.

◆ charset

dc.EventObjects.URL.charset

Definition at line 512 of file EventObjects.py.

◆ classifierMask

dc.EventObjects.URL.classifierMask

Definition at line 544 of file EventObjects.py.

◆ CONTENT_EMPTY

int dc.EventObjects.URL.CONTENT_EMPTY = 0
static

Definition at line 453 of file EventObjects.py.

◆ CONTENT_STORED_ON_DISK

int dc.EventObjects.URL.CONTENT_STORED_ON_DISK = 1 << 0
static

Definition at line 454 of file EventObjects.py.

◆ CONTENT_TYPE_TEXT_HTML

string dc.EventObjects.URL.CONTENT_TYPE_TEXT_HTML = "text/html"
static

Definition at line 481 of file EventObjects.py.

◆ CONTENT_TYPE_UNDEFINED

string dc.EventObjects.URL.CONTENT_TYPE_UNDEFINED = ""
static

Definition at line 482 of file EventObjects.py.

◆ contentMask

dc.EventObjects.URL.contentMask

Definition at line 535 of file EventObjects.py.

◆ contentType

dc.EventObjects.URL.contentType

Definition at line 508 of file EventObjects.py.

◆ contentURLMd5

dc.EventObjects.URL.contentURLMd5

Definition at line 539 of file EventObjects.py.

◆ crawled

dc.EventObjects.URL.crawled

Definition at line 502 of file EventObjects.py.

◆ crawlingTime

dc.EventObjects.URL.crawlingTime

Definition at line 515 of file EventObjects.py.

◆ depth

dc.EventObjects.URL.depth

Definition at line 526 of file EventObjects.py.

◆ errorMask

dc.EventObjects.URL.errorMask

Definition at line 514 of file EventObjects.py.

◆ eTag

dc.EventObjects.URL.eTag

Definition at line 530 of file EventObjects.py.

◆ freq

dc.EventObjects.URL.freq

Definition at line 525 of file EventObjects.py.

◆ httpCode

dc.EventObjects.URL.httpCode

Definition at line 518 of file EventObjects.py.

◆ httpMethod

dc.EventObjects.URL.httpMethod

Definition at line 521 of file EventObjects.py.

◆ httpTimeout

dc.EventObjects.URL.httpTimeout

Definition at line 511 of file EventObjects.py.

◆ lastModified

dc.EventObjects.URL.lastModified

Definition at line 529 of file EventObjects.py.

◆ linksE

dc.EventObjects.URL.linksE

Definition at line 524 of file EventObjects.py.

◆ linksI

dc.EventObjects.URL.linksI

Definition at line 523 of file EventObjects.py.

◆ maxURLsFromPage

dc.EventObjects.URL.maxURLsFromPage

Definition at line 534 of file EventObjects.py.

◆ mRate

dc.EventObjects.URL.mRate

Definition at line 531 of file EventObjects.py.

◆ mRateCounter

dc.EventObjects.URL.mRateCounter

Definition at line 532 of file EventObjects.py.

◆ parentMd5

dc.EventObjects.URL.parentMd5

Definition at line 528 of file EventObjects.py.

◆ pDate

dc.EventObjects.URL.pDate

Definition at line 538 of file EventObjects.py.

◆ priority

dc.EventObjects.URL.priority

Definition at line 540 of file EventObjects.py.

◆ processed

dc.EventObjects.URL.processed

Definition at line 503 of file EventObjects.py.

◆ processingDelay

dc.EventObjects.URL.processingDelay

Definition at line 510 of file EventObjects.py.

◆ processingTime

dc.EventObjects.URL.processingTime

Definition at line 516 of file EventObjects.py.

◆ rawContentMd5

dc.EventObjects.URL.rawContentMd5

Definition at line 527 of file EventObjects.py.

◆ requestDelay

dc.EventObjects.URL.requestDelay

Definition at line 509 of file EventObjects.py.

◆ SITE_SELECT_TYPE_AUTO

int dc.EventObjects.URL.SITE_SELECT_TYPE_AUTO = 1
static

Definition at line 475 of file EventObjects.py.

◆ SITE_SELECT_TYPE_EXPLICIT

int dc.EventObjects.URL.SITE_SELECT_TYPE_EXPLICIT = 0
static

Definition at line 472 of file EventObjects.py.

◆ SITE_SELECT_TYPE_NONE

int dc.EventObjects.URL.SITE_SELECT_TYPE_NONE = 3
static

Definition at line 479 of file EventObjects.py.

◆ SITE_SELECT_TYPE_QUALIFY_URL

int dc.EventObjects.URL.SITE_SELECT_TYPE_QUALIFY_URL = 2
static

Definition at line 478 of file EventObjects.py.

◆ siteId

dc.EventObjects.URL.siteId

Definition at line 493 of file EventObjects.py.

◆ siteSelect

dc.EventObjects.URL.siteSelect

Definition at line 501 of file EventObjects.py.

◆ size

dc.EventObjects.URL.size

Definition at line 522 of file EventObjects.py.

◆ state

dc.EventObjects.URL.state

Definition at line 499 of file EventObjects.py.

◆ STATE_DISABLED

int dc.EventObjects.URL.STATE_DISABLED = 1
static

Definition at line 438 of file EventObjects.py.

◆ STATE_ENABLED

int dc.EventObjects.URL.STATE_ENABLED = 0
static

Definition at line 437 of file EventObjects.py.

◆ STATE_ERROR

int dc.EventObjects.URL.STATE_ERROR = 2
static

Definition at line 439 of file EventObjects.py.

◆ status

dc.EventObjects.URL.status

Definition at line 500 of file EventObjects.py.

◆ STATUS_CRAWLED

int dc.EventObjects.URL.STATUS_CRAWLED = 4
static

Definition at line 446 of file EventObjects.py.

◆ STATUS_CRAWLING

int dc.EventObjects.URL.STATUS_CRAWLING = 3
static

Definition at line 445 of file EventObjects.py.

◆ STATUS_NEW

int dc.EventObjects.URL.STATUS_NEW = 1
static

Definition at line 443 of file EventObjects.py.

◆ STATUS_PROCESSED

int dc.EventObjects.URL.STATUS_PROCESSED = 7
static

Definition at line 449 of file EventObjects.py.

◆ STATUS_PROCESSING

int dc.EventObjects.URL.STATUS_PROCESSING = 6
static

Definition at line 448 of file EventObjects.py.

◆ STATUS_SELECTED_CRAWLING

int dc.EventObjects.URL.STATUS_SELECTED_CRAWLING = 2
static

Definition at line 444 of file EventObjects.py.

◆ STATUS_SELECTED_CRAWLING_INCREMENTAL

int dc.EventObjects.URL.STATUS_SELECTED_CRAWLING_INCREMENTAL = 8
static

Definition at line 450 of file EventObjects.py.

◆ STATUS_SELECTED_PROCESSING

int dc.EventObjects.URL.STATUS_SELECTED_PROCESSING = 5
static

Definition at line 447 of file EventObjects.py.

◆ STATUS_UNDEFINED

int dc.EventObjects.URL.STATUS_UNDEFINED = 0
static

Definition at line 442 of file EventObjects.py.

◆ tagsCount

dc.EventObjects.URL.tagsCount

Definition at line 537 of file EventObjects.py.

◆ tagsMask

dc.EventObjects.URL.tagsMask

Definition at line 536 of file EventObjects.py.

◆ tcDate

dc.EventObjects.URL.tcDate

Definition at line 533 of file EventObjects.py.

◆ totalTime

dc.EventObjects.URL.totalTime

Definition at line 517 of file EventObjects.py.

◆ type

dc.EventObjects.URL.type

Definition at line 498 of file EventObjects.py.

◆ TYPE_CHAIN

int dc.EventObjects.URL.TYPE_CHAIN = 6
static

Definition at line 469 of file EventObjects.py.

◆ TYPE_FETCHED

int dc.EventObjects.URL.TYPE_FETCHED = 4
static

Definition at line 465 of file EventObjects.py.

◆ TYPE_NEW_SITE

int dc.EventObjects.URL.TYPE_NEW_SITE = 3
static

Definition at line 463 of file EventObjects.py.

◆ TYPE_REAL_TIME_CRAWLER

int dc.EventObjects.URL.TYPE_REAL_TIME_CRAWLER = 5
static

Definition at line 467 of file EventObjects.py.

◆ TYPE_REGULAR

int dc.EventObjects.URL.TYPE_REGULAR = 0
static

Definition at line 457 of file EventObjects.py.

◆ TYPE_REGULAR_EXT

int dc.EventObjects.URL.TYPE_REGULAR_EXT = 2
static

Definition at line 461 of file EventObjects.py.

◆ TYPE_SINGLE

int dc.EventObjects.URL.TYPE_SINGLE = 1
static

Definition at line 459 of file EventObjects.py.

◆ UDate

dc.EventObjects.URL.UDate

Definition at line 519 of file EventObjects.py.

◆ url

dc.EventObjects.URL.url

Definition at line 494 of file EventObjects.py.

◆ URL_NORMALIZE_MASK

dc.EventObjects.URL.URL_NORMALIZE_MASK = UrlNormalizator.NORM_DEFAULT
static

Definition at line 484 of file EventObjects.py.

◆ urlMd5

dc.EventObjects.URL.urlMd5

Definition at line 505 of file EventObjects.py.

◆ urlPut

dc.EventObjects.URL.urlPut

Definition at line 542 of file EventObjects.py.

◆ urlUpdate

dc.EventObjects.URL.urlUpdate

Definition at line 541 of file EventObjects.py.


The documentation for this class was generated from the following file: