HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc.EventObjects.Site Class Reference
Inheritance diagram for dc.EventObjects.Site: dc.EventObjects.Site inherits from app.Utils.JsonSerializable (diagram image not included).
Collaboration diagram for dc.EventObjects.Site:

Public Member Functions

def __init__ (self, url, _userId=0)
 
def rewriteFields (self, siteObj, addListFields=True)
 
- Public Member Functions inherited from app.Utils.JsonSerializable
def __init__ (self)
 
def toJSON (self)
 

Static Public Member Functions

def isInProperties (prop, keyName)
 
def getFromProperties (prop, keyName, fieldName="value")
 
- Static Public Member Functions inherited from app.Utils.JsonSerializable
def json_serial (obj)
 

Public Attributes

 id
 
 uDate
 
 tcDate
 
 cDate
 
 resources
 
 contents
 
 iterations
 
 state
 
 priority
 
 maxURLs
 
 maxResources
 
 maxErrors
 
 maxResourceSize
 
 requestDelay
 
 processingDelay
 
 httpTimeout
 
 errorMask
 
 errors
 
 size
 
 avgSpeed
 
 avgSpeedCounter
 
 urlType
 
 description
 
 urls
 
 filters
 
 properties
 
 userId
 
 recrawlPeriod
 
 recrawlDate
 
 maxURLsFromPage
 
 collectedURLs
 
 fetchType
 
 newURLs
 
 deletedURLs
 
 moveURLs
 
 tcDateProcess
 
 categoryId
 

Static Public Attributes

int STATE_ACTIVE = 1
 
int STATE_DISABLED = 2
 
int STATE_SUSPENDED = 3
 
int STATE_DELETED = 4
 
int STATE_DELETE_TASK = 5
 
int STATE_RESTART = 6
 
int STATE_CLEANED = 7
 
int STATE_CLEANUP_TASK = 8
 
int STATE_NOT_FOUND = 9
 
int FETCH_TYPE_STATIC = 1
 
int FETCH_TYPE_DYNAMIC = 2
 
int FETCH_TYPE_AUTO = 7
 
int FETCH_TYPE_EXTERNAL = 3
 
int DEFAULT_PRIORITY = 100
 
int DEFAULT_CATEGORY_ID = 0
 

Detailed Description

Definition at line 31 of file EventObjects.py.

Constructor & Destructor Documentation

◆ __init__()

def dc.EventObjects.Site.__init__ (   self,
  url,
  _userId = 0 
)

Definition at line 58 of file EventObjects.py.

def __init__(self, url, _userId=0):
    """Create a site object with default crawling settings.

    The given root URL is passed through ``URL(...).getURL(...)`` with
    ``UrlNormalizator.NORM_NONE``; the site id is the md5 hex digest of the
    result.  All other attributes are initialised to the defaults below.

    @param url: root URL of the site, used to compute the site id and as
                the first entry of ``self.urls`` (when not empty)
    @param _userId: value stored in ``self.userId``
    """
    super(Site, self).__init__()

    url = URL(siteId=0, url=url,
              normalizeMask=UrlNormalizator.NORM_NONE).getURL(normalizeMask=UrlNormalizator.NORM_NONE)
    # #@var id
    # The site Id, calculated as md5 hash from first root url.
    # hashlib works on byte strings: a text (unicode) URL is encoded to UTF-8
    # explicitly so that non-ASCII URLs do not fail on the implicit ASCII
    # conversion; byte strings and ASCII text produce the same id as before.
    urlBytes = url.encode("utf-8") if isinstance(url, type(u"")) else url
    self.id = hashlib.md5(urlBytes).hexdigest()
    self.uDate = None
    self.tcDate = None
    self.cDate = SQLExpression("NOW()")
    self.resources = 0
    self.contents = 0
    self.iterations = 0
    # #@var state
    # The site state, one of the STATE_* constants
    self.state = self.STATE_ACTIVE
    # #@var priority
    # The site priority, low value means low priority
    self.priority = self.DEFAULT_PRIORITY
    # The max URLs number that can be collected for site, 0 - means unlimited
    self.maxURLs = 0
    # The max resources number that can be collected for site, 0 - means unlimited
    self.maxResources = 0
    # The max crawling error number, 0 - means unlimited
    self.maxErrors = 0
    # The max resource size, byte
    self.maxResourceSize = 0
    # HTTP request delay, msec
    self.requestDelay = 500
    # Content processing delay, msec
    self.processingDelay = 500
    # HTTP response timeout, msec
    self.httpTimeout = 30000
    # Error mask bit set, see detailed specification
    self.errorMask = 0
    # Errors counter
    self.errors = 0
    # The sum of all raw content file sizes of resources crawled
    self.size = 0
    # AVG bytes per second (BPS) rate
    self.avgSpeed = 0
    # Total number of times the avg speed was calculated
    self.avgSpeedCounter = 0
    # URL type
    # 0 - Regular, collect URLs and insert only for this site according to filters;
    # 1 - Single, do not collect URLs;
    # 3 - collect URLs, create sites and insert for all
    self.urlType = 0
    # #@var description
    # The site description.
    self.description = ""
    # #@var urls
    # The list of SiteURL objects used as root entry points
    self.urls = []
    if url is not None and len(url) > 0:
        localUrl = SiteURL(siteId=self.id, url=url, normalizeMask=UrlNormalizator.NORM_NONE)
        self.urls.append(localUrl)
    # #@var filters
    # The list of url filters object SiteFilter
    self.filters = [SiteFilter(self.id, "(.*)")]
    # #@var properties
    # The list of site properties, each a dict with "name" and "value" keys
    self.properties = [{"name": "PROCESS_CTYPES", "value": "text/html"},
                       {"name": "STORE_HTTP_REQUEST", "value": "1"},
                       {"name": "STORE_HTTP_HEADERS", "value": "1"},
                       {"name": "HTTP_HEADERS", "value": ""},
                       {"name": "HTTP_COOKIE", "value": ""}]
    self.userId = _userId
    # #@var recrawlPeriod
    self.recrawlPeriod = 0
    # #@var recrawlDate
    self.recrawlDate = None
    # #@var maxURLsFromPage
    self.maxURLsFromPage = 0
    # #@var collectedURLs
    self.collectedURLs = 0
    # #@var fetchType
    self.fetchType = self.FETCH_TYPE_STATIC
    # #@var newURLs
    self.newURLs = 0
    # #@var deletedURLs
    self.deletedURLs = 0
    # #@var moveURLs
    self.moveURLs = True
    self.tcDateProcess = None
    self.categoryId = self.DEFAULT_CATEGORY_ID
145 
146 
def __init__(self)
constructor
Definition: UIDGenerator.py:19

Member Function Documentation

◆ getFromProperties()

def dc.EventObjects.Site.getFromProperties (   prop,
  keyName,
  fieldName = "value" 
)
static

Definition at line 193 of file EventObjects.py.

def getFromProperties(prop, keyName, fieldName="value"):
    """Look up a property value by name.

    ``prop`` may be either:
      * a dict - the value stored under ``keyName`` is returned;
      * a list of dicts - the first dict whose "name" equals ``keyName`` and
        which contains ``fieldName`` supplies ``item[fieldName]``.

    List entries that are not dicts, have no "name" key, or lack
    ``fieldName`` are skipped instead of raising.

    @param prop: properties container (dict or list of dicts)
    @param keyName: property name to search for
    @param fieldName: field to read from a matching list entry
    @return: the found value, or None when nothing matches or ``prop`` is
             neither a dict nor a list
    """
    if isinstance(prop, dict):
        return prop[keyName] if keyName in prop else None

    if isinstance(prop, list):
        for item in prop:
            if not isinstance(item, dict):
                continue
            # Guard the "name" lookup: malformed entries must not abort the search.
            if "name" in item and item["name"] == keyName and fieldName in item:
                return item[fieldName]

    return None
206 
207 
208 
209 # #SiteUpdate event object
210 #
211 # The update site operation object
212 #

◆ isInProperties()

def dc.EventObjects.Site.isInProperties (   prop,
  keyName 
)
static

Definition at line 172 of file EventObjects.py.

def isInProperties(prop, keyName):
    """Tell whether a property named ``keyName`` is present in ``prop``.

    ``prop`` may be either:
      * a dict - the check is ``keyName in prop``;
      * a list of dicts - True when some dict has "name" equal to ``keyName``.

    List entries that are not dicts or have no "name" key are skipped
    instead of raising.

    @param prop: properties container (dict or list of dicts)
    @param keyName: property name to search for
    @return: True when the property is found, otherwise False (also for any
             other ``prop`` type)
    """
    if isinstance(prop, dict):
        return keyName in prop

    if isinstance(prop, list):
        for item in prop:
            # Guard the "name" lookup: malformed entries must not abort the search.
            if isinstance(item, dict) and "name" in item and item["name"] == keyName:
                return True

    return False
185 
186 

◆ rewriteFields()

def dc.EventObjects.Site.rewriteFields (   self,
  siteObj,
  addListFields = True 
)

Definition at line 151 of file EventObjects.py.

def rewriteFields(self, siteObj, addListFields=True):
    """Copy field values from ``siteObj`` into this object.

    Every attribute of ``siteObj`` that is not None overwrites the attribute
    of the same name on ``self``, except for the list fields "urls",
    "filters" and "properties", which are handled separately:

      * ``addListFields`` true  - the items of ``siteObj``'s list are appended
        to this object's list (a new list is started when this object's
        value is None or the attribute is missing);
      * ``addListFields`` false - this object's field is replaced with
        ``siteObj``'s list object itself (no copy is made).

    A list field that is None or missing on ``siteObj`` leaves this object's
    field untouched.

    @param siteObj: object whose attributes are copied from
    @param addListFields: append to list fields instead of replacing them
    """
    excludeFields = ["urls", "filters", "properties"]
    source = vars(siteObj)
    target = vars(self)

    # Scalar fields: plain overwrite with every non-None value.
    for field, value in source.items():
        if field not in excludeFields and value is not None:
            target[field] = value

    # List fields: append or replace, tolerating attributes absent on either side.
    for field in excludeFields:
        incoming = source.get(field)
        if incoming is None:
            continue
        if addListFields:
            current = target.get(field)
            if current is None:
                current = []
            current += incoming
            target[field] = current
        else:
            target[field] = incoming
165 
166 

Member Data Documentation

◆ avgSpeed

dc.EventObjects.Site.avgSpeed

Definition at line 99 of file EventObjects.py.

◆ avgSpeedCounter

dc.EventObjects.Site.avgSpeedCounter

Definition at line 101 of file EventObjects.py.

◆ categoryId

dc.EventObjects.Site.categoryId

Definition at line 144 of file EventObjects.py.

◆ cDate

dc.EventObjects.Site.cDate

Definition at line 68 of file EventObjects.py.

◆ collectedURLs

dc.EventObjects.Site.collectedURLs

Definition at line 134 of file EventObjects.py.

◆ contents

dc.EventObjects.Site.contents

Definition at line 70 of file EventObjects.py.

◆ DEFAULT_CATEGORY_ID

int dc.EventObjects.Site.DEFAULT_CATEGORY_ID = 0
static

Definition at line 52 of file EventObjects.py.

◆ DEFAULT_PRIORITY

int dc.EventObjects.Site.DEFAULT_PRIORITY = 100
static

Definition at line 49 of file EventObjects.py.

◆ deletedURLs

dc.EventObjects.Site.deletedURLs

Definition at line 140 of file EventObjects.py.

◆ description

dc.EventObjects.Site.description

Definition at line 109 of file EventObjects.py.

◆ errorMask

dc.EventObjects.Site.errorMask

Definition at line 93 of file EventObjects.py.

◆ errors

dc.EventObjects.Site.errors

Definition at line 95 of file EventObjects.py.

◆ FETCH_TYPE_AUTO

int dc.EventObjects.Site.FETCH_TYPE_AUTO = 7
static

Definition at line 45 of file EventObjects.py.

◆ FETCH_TYPE_DYNAMIC

int dc.EventObjects.Site.FETCH_TYPE_DYNAMIC = 2
static

Definition at line 44 of file EventObjects.py.

◆ FETCH_TYPE_EXTERNAL

int dc.EventObjects.Site.FETCH_TYPE_EXTERNAL = 3
static

Definition at line 46 of file EventObjects.py.

◆ FETCH_TYPE_STATIC

int dc.EventObjects.Site.FETCH_TYPE_STATIC = 1
static

Definition at line 43 of file EventObjects.py.

◆ fetchType

dc.EventObjects.Site.fetchType

Definition at line 136 of file EventObjects.py.

◆ filters

dc.EventObjects.Site.filters

Definition at line 118 of file EventObjects.py.

◆ httpTimeout

dc.EventObjects.Site.httpTimeout

Definition at line 91 of file EventObjects.py.

◆ id

dc.EventObjects.Site.id

Definition at line 65 of file EventObjects.py.

◆ iterations

dc.EventObjects.Site.iterations

Definition at line 71 of file EventObjects.py.

◆ maxErrors

dc.EventObjects.Site.maxErrors

Definition at line 83 of file EventObjects.py.

◆ maxResources

dc.EventObjects.Site.maxResources

Definition at line 81 of file EventObjects.py.

◆ maxResourceSize

dc.EventObjects.Site.maxResourceSize

Definition at line 85 of file EventObjects.py.

◆ maxURLs

dc.EventObjects.Site.maxURLs

Definition at line 79 of file EventObjects.py.

◆ maxURLsFromPage

dc.EventObjects.Site.maxURLsFromPage

Definition at line 132 of file EventObjects.py.

◆ moveURLs

dc.EventObjects.Site.moveURLs

Definition at line 142 of file EventObjects.py.

◆ newURLs

dc.EventObjects.Site.newURLs

Definition at line 138 of file EventObjects.py.

◆ priority

dc.EventObjects.Site.priority

Definition at line 77 of file EventObjects.py.

◆ processingDelay

dc.EventObjects.Site.processingDelay

Definition at line 89 of file EventObjects.py.

◆ properties

dc.EventObjects.Site.properties

Definition at line 121 of file EventObjects.py.

◆ recrawlDate

dc.EventObjects.Site.recrawlDate

Definition at line 130 of file EventObjects.py.

◆ recrawlPeriod

dc.EventObjects.Site.recrawlPeriod

Definition at line 128 of file EventObjects.py.

◆ requestDelay

dc.EventObjects.Site.requestDelay

Definition at line 87 of file EventObjects.py.

◆ resources

dc.EventObjects.Site.resources

Definition at line 69 of file EventObjects.py.

◆ size

dc.EventObjects.Site.size

Definition at line 97 of file EventObjects.py.

◆ state

dc.EventObjects.Site.state

Definition at line 74 of file EventObjects.py.

◆ STATE_ACTIVE

int dc.EventObjects.Site.STATE_ACTIVE = 1
static

Definition at line 33 of file EventObjects.py.

◆ STATE_CLEANED

int dc.EventObjects.Site.STATE_CLEANED = 7
static

Definition at line 39 of file EventObjects.py.

◆ STATE_CLEANUP_TASK

int dc.EventObjects.Site.STATE_CLEANUP_TASK = 8
static

Definition at line 40 of file EventObjects.py.

◆ STATE_DELETE_TASK

int dc.EventObjects.Site.STATE_DELETE_TASK = 5
static

Definition at line 37 of file EventObjects.py.

◆ STATE_DELETED

int dc.EventObjects.Site.STATE_DELETED = 4
static

Definition at line 36 of file EventObjects.py.

◆ STATE_DISABLED

int dc.EventObjects.Site.STATE_DISABLED = 2
static

Definition at line 34 of file EventObjects.py.

◆ STATE_NOT_FOUND

int dc.EventObjects.Site.STATE_NOT_FOUND = 9
static

Definition at line 41 of file EventObjects.py.

◆ STATE_RESTART

int dc.EventObjects.Site.STATE_RESTART = 6
static

Definition at line 38 of file EventObjects.py.

◆ STATE_SUSPENDED

int dc.EventObjects.Site.STATE_SUSPENDED = 3
static

Definition at line 35 of file EventObjects.py.

◆ tcDate

dc.EventObjects.Site.tcDate

Definition at line 67 of file EventObjects.py.

◆ tcDateProcess

dc.EventObjects.Site.tcDateProcess

Definition at line 143 of file EventObjects.py.

◆ uDate

dc.EventObjects.Site.uDate

Definition at line 66 of file EventObjects.py.

◆ urls

dc.EventObjects.Site.urls

Definition at line 112 of file EventObjects.py.

◆ urlType

dc.EventObjects.Site.urlType

Definition at line 106 of file EventObjects.py.

◆ userId

dc.EventObjects.Site.userId

Definition at line 126 of file EventObjects.py.


The documentation for this class was generated from the following file: EventObjects.py