HCE Project: Python-language Distributed Tasks Manager Application, Distributed Crawler Application, and client API bindings (version 2.0.0-chaika)
Hierarchical Cluster Engine Python language binding
dc.EventObjects.URLUpdate Class Reference
Inheritance diagram for dc.EventObjects.URLUpdate:
Collaboration diagram for dc.EventObjects.URLUpdate:

Public Member Functions

def __init__ (self, siteId, urlString, urlType=URLStatus.URL_TYPE_URL, stateField=None, statusField=None, normalizeMask=URL.URL_NORMALIZE_MASK, urlObject=None)
 
def fillMD5 (self, urlString, urlType)
 
- Public Member Functions inherited from dc.EventObjects.URL
def __init__ (self, siteId, url, state=STATE_ENABLED, urlUpdate=None, normalizeMask=URL_NORMALIZE_MASK)
 
def getURL (self, normalizeMask=URL_NORMALIZE_MASK)
 
- Public Member Functions inherited from app.Utils.JsonSerializable
def __init__ (self)
 
def toJSON (self)
 

Public Attributes

 siteId
 
 type
 
 state
 
 status
 
 siteSelect
 
 crawled
 
 processed
 
 contentType
 
 requestDelay
 
 processingDelay
 
 httpTimeout
 
 charset
 
 batchId
 
 errorMask
 
 crawlingTime
 
 processingTime
 
 totalTime
 
 httpCode
 
 UDate
 
 CDate
 
 httpMethod
 
 size
 
 linksI
 
 linksE
 
 freq
 
 depth
 
 rawContentMd5
 
 parentMd5
 
 lastModified
 
 eTag
 
 mRate
 
 mRateCounter
 
 tcDate
 
 maxURLsFromPage
 
 priority
 
 tagsCount
 
 contentURLMd5
 
 tagsMask
 
 chainId
 
 classifierMask
 
 attributes
 
 criterions
 
 url
 
 urlMd5
 
- Public Attributes inherited from dc.EventObjects.URL
 siteId
 
 url
 
 type
 
 state
 
 status
 
 siteSelect
 
 crawled
 
 processed
 
 urlMd5
 
 contentType
 
 requestDelay
 
 processingDelay
 
 httpTimeout
 
 charset
 
 batchId
 
 errorMask
 
 crawlingTime
 
 processingTime
 
 totalTime
 
 httpCode
 
 UDate
 
 CDate
 
 httpMethod
 
 size
 
 linksI
 
 linksE
 
 freq
 
 depth
 
 rawContentMd5
 
 parentMd5
 
 lastModified
 
 eTag
 
 mRate
 
 mRateCounter
 
 tcDate
 
 maxURLsFromPage
 
 contentMask
 
 tagsMask
 
 tagsCount
 
 pDate
 
 contentURLMd5
 
 priority
 
 urlUpdate
 
 urlPut
 
 chainId
 
 classifierMask
 
 attributes
 

Additional Inherited Members

- Static Public Member Functions inherited from app.Utils.JsonSerializable
def json_serial (obj)
 
- Static Public Attributes inherited from dc.EventObjects.URL
int STATE_ENABLED = 0
 
int STATE_DISABLED = 1
 
int STATE_ERROR = 2
 
int STATUS_UNDEFINED = 0
 
int STATUS_NEW = 1
 
int STATUS_SELECTED_CRAWLING = 2
 
int STATUS_CRAWLING = 3
 
int STATUS_CRAWLED = 4
 
int STATUS_SELECTED_PROCESSING = 5
 
int STATUS_PROCESSING = 6
 
int STATUS_PROCESSED = 7
 
int STATUS_SELECTED_CRAWLING_INCREMENTAL = 8
 
int CONTENT_EMPTY = 0
 
int CONTENT_STORED_ON_DISK = 1 << 0
 
int TYPE_REGULAR = 0
 
int TYPE_SINGLE = 1
 
int TYPE_REGULAR_EXT = 2
 
int TYPE_NEW_SITE = 3
 
int TYPE_FETCHED = 4
 
int TYPE_REAL_TIME_CRAWLER = 5
 
int TYPE_CHAIN = 6
 
int SITE_SELECT_TYPE_EXPLICIT = 0
 
int SITE_SELECT_TYPE_AUTO = 1
 
int SITE_SELECT_TYPE_QUALIFY_URL = 2
 
int SITE_SELECT_TYPE_NONE = 3
 
string CONTENT_TYPE_TEXT_HTML = "text/html"
 
string CONTENT_TYPE_UNDEFINED = ""
 
 URL_NORMALIZE_MASK = UrlNormalizator.NORM_DEFAULT
 

Detailed Description

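URLUpdate inherits members from dc.EventObjects.URL and app.Utils.JsonSerializable. Its constructor accepts either explicit values (siteId, urlString, urlType, stateField, statusField, normalizeMask) or an existing URL instance passed as urlObject. When urlObject is None or is not a URL instance, the constructor assigns siteId, state and status from its arguments, sets UDate to SQLExpression("NOW()"), initialises criterions with URLFetch.CRITERION_LIMIT set to 1, and sets the remaining attributes listed in its body to None. When urlType is not URLStatus.URL_TYPE_URL, fillMD5() stores urlString in urlMd5 and sets url to None.

Definition at line 656 of file EventObjects.py.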

Constructor & Destructor Documentation

◆ __init__()

def dc.EventObjects.URLUpdate.__init__ (   self,
  siteId,
  urlString,
  urlType = URLStatus.URL_TYPE_URL,
  stateField = None,
  statusField = None,
  normalizeMask = URL.URL_NORMALIZE_MASK,
  urlObject = None 
)

Definition at line 668 of file EventObjects.py.

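667  def __init__(self, siteId, urlString, urlType=URLStatus.URL_TYPE_URL, stateField=None, statusField=None,
668  normalizeMask=URL.URL_NORMALIZE_MASK, urlObject=None):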
669  if urlObject is None or not isinstance(urlObject, URL):
670  # Init with default
671  if urlType == URLStatus.URL_TYPE_URL:
672  url = urlString
673  else:
674  url = None
675  # super(URLUpdate, self).__init__(siteId, urlString, stateField)
676  super(URLUpdate, self).__init__(siteId=siteId, url=url, state=stateField, normalizeMask=normalizeMask)
677  self.siteId = siteId
678  self.type = None
679  self.state = stateField
680  self.status = statusField
681  self.siteSelect = None
682  self.crawled = None
683  self.processed = None
684  self.fillMD5(urlString, urlType)
685  self.contentType = None
686  self.requestDelay = None
687  self.processingDelay = None
688  self.httpTimeout = None
689  self.charset = None
690  self.batchId = None
691  self.errorMask = None
692  self.crawlingTime = None
693  self.processingTime = None
694  self.totalTime = None
695  self.httpCode = None
696  self.UDate = SQLExpression("NOW()")
697  self.CDate = None
698  self.httpMethod = None
699  self.size = None
700  self.linksI = None
701  self.linksE = None
702  self.freq = None
703  self.depth = None
704  self.rawContentMd5 = None
705  self.parentMd5 = None
706  self.lastModified = None
707  self.eTag = None
708  self.mRate = None
709  self.mRateCounter = None
710  self.tcDate = None
711  self.maxURLsFromPage = None
712  self.priority = None
713  self.tagsCount = None
714  self.contentURLMd5 = None
715  self.tagsMask = None
716  self.chainId = None
717  self.classifierMask = None
718  self.attributes = None
719  # Init criterions
720  self.criterions = {}
721  self.criterions[URLFetch.CRITERION_LIMIT] = 1
722  else:
723  # Init from URL object
724  for name, value in urlObject.__dict__.items():
725  if not name.startswith("__"):
726  if hasattr(self, name) and value is not None:
727  setattr(self, name, value)
728 
729 
def __init__(self)
constructor
Definition: UIDGenerator.py:19

Member Function Documentation

◆ fillMD5()

def dc.EventObjects.URLUpdate.fillMD5 (   self,
  urlString,
  urlType 
)

Definition at line 734 of file EventObjects.py.

734  def fillMD5(self, urlString, urlType):
735  if urlType == URLStatus.URL_TYPE_URL:
736  # Commented out because parent class doing the same
737  # self.url = urlString
738  # self.urlMd5 = hashlib.md5(urlString).hexdigest()
739  pass
740  else:
741  self.url = None
742  self.urlMd5 = urlString
743 
744 
745 
746 # #BatchItem object
747 #
748 # The BatchItem object for batch crawling tasks.
749 #

Member Data Documentation

◆ attributes

dc.EventObjects.URLUpdate.attributes

Definition at line 718 of file EventObjects.py.

◆ batchId

dc.EventObjects.URLUpdate.batchId

Definition at line 690 of file EventObjects.py.

◆ CDate

dc.EventObjects.URLUpdate.CDate

Definition at line 697 of file EventObjects.py.

◆ chainId

dc.EventObjects.URLUpdate.chainId

Definition at line 716 of file EventObjects.py.

◆ charset

dc.EventObjects.URLUpdate.charset

Definition at line 689 of file EventObjects.py.

◆ classifierMask

dc.EventObjects.URLUpdate.classifierMask

Definition at line 717 of file EventObjects.py.

◆ contentType

dc.EventObjects.URLUpdate.contentType

Definition at line 685 of file EventObjects.py.

◆ contentURLMd5

dc.EventObjects.URLUpdate.contentURLMd5

Definition at line 714 of file EventObjects.py.

◆ crawled

dc.EventObjects.URLUpdate.crawled

Definition at line 682 of file EventObjects.py.

◆ crawlingTime

dc.EventObjects.URLUpdate.crawlingTime

Definition at line 692 of file EventObjects.py.

◆ criterions

dc.EventObjects.URLUpdate.criterions

Definition at line 720 of file EventObjects.py.

◆ depth

dc.EventObjects.URLUpdate.depth

Definition at line 703 of file EventObjects.py.

◆ errorMask

dc.EventObjects.URLUpdate.errorMask

Definition at line 691 of file EventObjects.py.

◆ eTag

dc.EventObjects.URLUpdate.eTag

Definition at line 707 of file EventObjects.py.

◆ freq

dc.EventObjects.URLUpdate.freq

Definition at line 702 of file EventObjects.py.

◆ httpCode

dc.EventObjects.URLUpdate.httpCode

Definition at line 695 of file EventObjects.py.

◆ httpMethod

dc.EventObjects.URLUpdate.httpMethod

Definition at line 698 of file EventObjects.py.

◆ httpTimeout

dc.EventObjects.URLUpdate.httpTimeout

Definition at line 688 of file EventObjects.py.

◆ lastModified

dc.EventObjects.URLUpdate.lastModified

Definition at line 706 of file EventObjects.py.

◆ linksE

dc.EventObjects.URLUpdate.linksE

Definition at line 701 of file EventObjects.py.

◆ linksI

dc.EventObjects.URLUpdate.linksI

Definition at line 700 of file EventObjects.py.

◆ maxURLsFromPage

dc.EventObjects.URLUpdate.maxURLsFromPage

Definition at line 711 of file EventObjects.py.

◆ mRate

dc.EventObjects.URLUpdate.mRate

Definition at line 708 of file EventObjects.py.

◆ mRateCounter

dc.EventObjects.URLUpdate.mRateCounter

Definition at line 709 of file EventObjects.py.

◆ parentMd5

dc.EventObjects.URLUpdate.parentMd5

Definition at line 705 of file EventObjects.py.

◆ priority

dc.EventObjects.URLUpdate.priority

Definition at line 712 of file EventObjects.py.

◆ processed

dc.EventObjects.URLUpdate.processed

Definition at line 683 of file EventObjects.py.

◆ processingDelay

dc.EventObjects.URLUpdate.processingDelay

Definition at line 687 of file EventObjects.py.

◆ processingTime

dc.EventObjects.URLUpdate.processingTime

Definition at line 693 of file EventObjects.py.

◆ rawContentMd5

dc.EventObjects.URLUpdate.rawContentMd5

Definition at line 704 of file EventObjects.py.

◆ requestDelay

dc.EventObjects.URLUpdate.requestDelay

Definition at line 686 of file EventObjects.py.

◆ siteId

dc.EventObjects.URLUpdate.siteId

Definition at line 677 of file EventObjects.py.

◆ siteSelect

dc.EventObjects.URLUpdate.siteSelect

Definition at line 681 of file EventObjects.py.

◆ size

dc.EventObjects.URLUpdate.size

Definition at line 699 of file EventObjects.py.

◆ state

dc.EventObjects.URLUpdate.state

Definition at line 679 of file EventObjects.py.

◆ status

dc.EventObjects.URLUpdate.status

Definition at line 680 of file EventObjects.py.

◆ tagsCount

dc.EventObjects.URLUpdate.tagsCount

Definition at line 713 of file EventObjects.py.

◆ tagsMask

dc.EventObjects.URLUpdate.tagsMask

Definition at line 715 of file EventObjects.py.

◆ tcDate

dc.EventObjects.URLUpdate.tcDate

Definition at line 710 of file EventObjects.py.

◆ totalTime

dc.EventObjects.URLUpdate.totalTime

Definition at line 694 of file EventObjects.py.

◆ type

dc.EventObjects.URLUpdate.type

Definition at line 678 of file EventObjects.py.

◆ UDate

dc.EventObjects.URLUpdate.UDate

Definition at line 696 of file EventObjects.py.

◆ url

dc.EventObjects.URLUpdate.url

Definition at line 741 of file EventObjects.py.

◆ urlMd5

dc.EventObjects.URLUpdate.urlMd5

Definition at line 742 of file EventObjects.py.


The documentation for this class was generated from the following file: