HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_db.URLNewTask.URLNewTask Class Reference
Inheritance diagram for dc_db.URLNewTask.URLNewTask:
Collaboration diagram for dc_db.URLNewTask.URLNewTask:

Public Member Functions

def __init__ (self, keyValueStorageDir, rawDataDir, dBDataTask, siteTask=None)
 
def newSiteCreate (self, initUrl, queryCallback)
 
def fillSiteRelatedFields (self, urlObj, queryCallback)
 
def resolveSiteIdByURL (self, url, queryCallback)
 
def calcSiteIdByUrl (self, url)
 
def siteTableOperation (self, urlObj, queryCallback)
 
def process (self, urls, queryCallback)
 
def urlInsertWithGoodSietId (self, urlObj, statusInit, queryCallback)
 
def selectURL (self, urlObject, queryCallback)
 
def addURL (self, urlObject, queryCallback)
 
def attributesSet (self, attributes, queryCallback)
 
- Public Member Functions inherited from dc_db.BaseTask.BaseTask
def isSiteExist (self, siteId, queryCallback, userId=None)
 
def generateCriterionSQL (self, criterions, additionWhere=None, siteId=None)
 
def fetchByCriterions (self, criterions, queryCallback)
 
def dbLock (self, mutexName, queryCallback, sleepTime=1, mutexLockTTL=Constants.DEFAULT_LOCK_TTL)
 
def dbUnlock (self, mutexName, queryCallback)
 
def createUrlsInsertQuery (self, siteId, localKeys, localValues)
 
def copyUrlsToDcUrls (self, siteId, queryCallback)
 
def statisticLogUpdate (self, localObj, urlMd5, siteId, status, queryCallback, isInsert=False)
 
def calculateMd5FormUrl (self, url, urlType, useNormilize=False)
 

Public Attributes

 siteTask
 
 recalculator
 
 urlMd5
 
 urlUpdateTask
 

Static Public Attributes

int CODE_GOOD_INSERT = 0
 
int CODE_BAD_INSERT = 1
 
int CODE_ALREADY_EXIST = 2
 

Additional Inherited Members

- Static Public Member Functions inherited from dc_db.BaseTask.BaseTask
def readValueFromSiteProp (siteId, propName, queryCallback, urlMd5=None)
 

Detailed Description

Definition at line 24 of file URLNewTask.py.

Constructor & Destructor Documentation

◆ __init__()

def dc_db.URLNewTask.URLNewTask.__init__ (   self,
  keyValueStorageDir,
  rawDataDir,
  dBDataTask,
  siteTask = None 
)

Definition at line 32 of file URLNewTask.py.

def __init__(self, keyValueStorageDir, rawDataDir, dBDataTask, siteTask=None):
    """Initialise the task and the helper objects it delegates to.

    keyValueStorageDir, rawDataDir and dBDataTask are passed unchanged to the
    URLUpdateTask constructor. siteTask is stored as given; newSiteCreate()
    raises if it is still None when a site has to be created.
    """
    super(URLNewTask, self).__init__()
    # Plain state first: the optional site task and the MD5 of the URL most
    # recently examined by selectURL() (read later by addURL()).
    self.siteTask = siteTask
    self.urlMd5 = None
    # Collaborators used by urlInsertWithGoodSietId().
    self.recalculator = FieldRecalculator()
    self.urlUpdateTask = URLUpdateTask(keyValueStorageDir, rawDataDir, dBDataTask)
def __init__(self)
constructor
Definition: UIDGenerator.py:19

Member Function Documentation

◆ addURL()

def dc_db.URLNewTask.URLNewTask.addURL (   self,
  urlObject,
  queryCallback 
)

Definition at line 210 of file URLNewTask.py.

def addURL(self, urlObject, queryCallback):
    """Insert urlObject into the per-site URLs table.

    The statistic log is updated first (with isInsert=True), regardless of
    whether the INSERT is issued afterwards. The INSERT is built from the
    fields/values that Constants derives from urlObject and is executed through
    queryCallback against the secondary DB.

    Returns True when an INSERT was issued, False when no field/value string
    could be built.
    """
    self.statisticLogUpdate(urlObject, self.urlMd5, urlObject.siteId, urlObject.status, queryCallback, True)

    columns, columnValues = Constants.getFieldsValuesTuple(urlObject, Constants.URLTableDict)
    assignments = Constants.createFieldsValuesString(columns, columnValues)
    if assignments is None or assignments == "":
        # Nothing to insert.
        return False

    tableName = Constants.DC_URLS_TABLE_NAME_TEMPLATE % urlObject.siteId
    insertQuery = Constants.INSERT_COMMON_TEMPLATE % (tableName, assignments)
    logger.debug(str(insertQuery))
    queryCallback(insertQuery, Constants.SECONDARY_DB_ID, Constants.EXEC_NAME, True)
    return True
Here is the call graph for this function:
Here is the caller graph for this function:

◆ attributesSet()

def dc_db.URLNewTask.URLNewTask.attributesSet (   self,
  attributes,
  queryCallback 
)

Definition at line 229 of file URLNewTask.py.

def attributesSet(self, attributes, queryCallback):
    """Pass attributes to a fresh AttrSetTask and log the outcome.

    Logs the number of attributes before processing and a varDump of the
    task's result afterwards. Returns nothing.
    """
    logger.debug(">>> Add Attributes (len) == " + str(len(attributes)))
    outcome = AttrSetTask().process(attributes, queryCallback)
    logger.debug(">>> Add Attributes (res) == " + varDump(outcome))
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
Here is the call graph for this function:
Here is the caller graph for this function:

◆ calcSiteIdByUrl()

def dc_db.URLNewTask.URLNewTask.calcSiteIdByUrl (   self,
  url 
)

Definition at line 84 of file URLNewTask.py.

def calcSiteIdByUrl(self, url):
    """Return the MD5 hex digest of the domain URL derived from url.

    The domain URL comes from Utils.UrlParser.generateDomainUrl(); a trailing
    '/' is appended when it is non-empty and does not already end with one.
    The value is handed to hashlib.md5() as-is, including when it is None or
    empty.
    """
    domainUrl = Utils.UrlParser.generateDomainUrl(url)
    if domainUrl and domainUrl[-1] != '/':
        domainUrl = domainUrl + '/'
    return hashlib.md5(domainUrl).hexdigest()
Here is the caller graph for this function:

◆ fillSiteRelatedFields()

def dc_db.URLNewTask.URLNewTask.fillSiteRelatedFields (   self,
  urlObj,
  queryCallback 
)

Definition at line 55 of file URLNewTask.py.

def fillSiteRelatedFields(self, urlObj, queryCallback):
    """Fill unset request settings of urlObj from its site's row.

    Reads RequestDelay, HTTPTimeout and URLType from the `sites` table of the
    primary DB for urlObj.siteId and copies each into requestDelay, httpTimeout
    and type respectively, but only where the urlObj field is still None.

    Returns True when the query produced at least one row, otherwise False.
    """
    # NOTE(review): siteId is interpolated straight into the SQL text.
    SITE_EXTRACT_SQL_QUERY = "SELECT `RequestDelay`, `HTTPTimeout`, `URLType` FROM `sites` WHERE id = '%s'"
    rows = queryCallback(SITE_EXTRACT_SQL_QUERY % urlObj.siteId, Constants.PRIMARY_DB_ID)
    if not hasattr(rows, '__iter__') or len(rows) == 0:
        return False

    # urlObj attribute -> column position in the SELECT above.
    for attrName, column in (("requestDelay", 0), ("httpTimeout", 1), ("type", 2)):
        if getattr(urlObj, attrName) is None:
            setattr(urlObj, attrName, rows[0][column])
    return True
Here is the caller graph for this function:

◆ newSiteCreate()

def dc_db.URLNewTask.URLNewTask.newSiteCreate (   self,
  initUrl,
  queryCallback 
)

Definition at line 44 of file URLNewTask.py.

def newSiteCreate(self, initUrl, queryCallback):
    """Create a site for initUrl by running self.siteTask on a new Site object.

    Raises Exception when no siteTask was supplied to the constructor.
    """
    if self.siteTask is None:
        raise Exception(">>> URLNew.siteTask object is None!")
    self.siteTask.process(dc.EventObjects.Site(initUrl), queryCallback)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ process()

def dc_db.URLNewTask.URLNewTask.process (   self,
  urls,
  queryCallback 
)

Definition at line 134 of file URLNewTask.py.

def process(self, urls, queryCallback):
    """Insert each URL object in urls and report one status code per URL.

    For every URL the site id is resolved (when missing and the selection mode
    is not explicit), site-related fields are filled from an existing site, the
    site table is updated when no related site was found, and the URL is
    inserted when it ends up with a non-empty site id. Exceptions raised by the
    site-table or insert step are logged and leave that URL's status at
    CODE_BAD_INSERT.

    Returns a GeneralResponse whose `statuses` list has one entry per URL
    (CODE_GOOD_INSERT, CODE_BAD_INSERT or CODE_ALREADY_EXIST).
    """
    ret = GeneralResponse()
    for url in urls:
        # Reset per URL: a status carried over from the previous iteration would
        # report a failed URL with its predecessor's result.
        status = URLNewTask.CODE_BAD_INSERT
        isRelatedSite = False
        if url.siteId is None and url.siteSelect != dc.EventObjects.URL.SITE_SELECT_TYPE_EXPLICIT:
            url.siteId = self.resolveSiteIdByURL(url.url, queryCallback)
        # NOTE(review): nesting reconstructed from a flattened listing; this check
        # is assumed to be a sibling of the `if` above, not nested in it - confirm.
        if self.isSiteExist(url.siteId, queryCallback):
            isRelatedSite = self.fillSiteRelatedFields(url, queryCallback)
        # Lazy %-style arguments: logging must not raise on a non-string url.
        logger.debug(">>> Url New main = %s", url.url)
        try:
            if not isRelatedSite:
                logger.debug(">>> Url New before = %s", url.url)
                self.siteTableOperation(url, queryCallback)
                # Log the site id chosen by siteTableOperation (the label says
                # Site_Id; the URL itself is logged on the lines around it).
                logger.debug(">>> Site_Id By URL = %s", str(url.siteId))
                logger.debug(">>> Url New after = %s", url.url)
            if url.siteId is not None and url.siteId != "":
                status = self.urlInsertWithGoodSietId(url, status, queryCallback)
        except Exception as excp:
            # Best effort per URL: record the failure and continue with the next one.
            logger.debug(">>> Url New operation exception = " + str(excp))
        ret.statuses.append(status)
    return ret
Here is the call graph for this function:
Here is the caller graph for this function:

◆ resolveSiteIdByURL()

def dc_db.URLNewTask.URLNewTask.resolveSiteIdByURL (   self,
  url,
  queryCallback 
)

Definition at line 74 of file URLNewTask.py.

def resolveSiteIdByURL(self, url, queryCallback):
    """Look up the site id for url in the primary DB.

    Runs Constants.SELECT_SITE_ID_BY_URL with url substituted in and returns
    the first column of the first row, or None when the query yields no rows.
    """
    # NOTE(review): url is interpolated straight into the SQL text.
    siteId = None
    rows = queryCallback(Constants.SELECT_SITE_ID_BY_URL % url, Constants.PRIMARY_DB_ID)
    if hasattr(rows, '__iter__') and len(rows) > 0:
        siteId = rows[0][0]
    # NOTE(review): listing indentation was lost; this log call is assumed to run
    # whether or not a row was found - confirm.
    logger.debug(">>> Site_Id By URL = %s", str(siteId))
    return siteId
Here is the caller graph for this function:

◆ selectURL()

def dc_db.URLNewTask.URLNewTask.selectURL (   self,
  urlObject,
  queryCallback 
)

Definition at line 191 of file URLNewTask.py.

def selectURL(self, urlObject, queryCallback):
    """Tell whether the URL is already stored in its site's `urls_<siteId>` table.

    Side effect: self.urlMd5 is set to urlObject.urlMd5 when that is given,
    otherwise to the MD5 hex digest of urlObject.url.

    Returns True when the COUNT(*) query on the secondary DB reports at least
    one row with that MD5, otherwise False.
    """
    LOCAL_URL_CHECK_QUERY = "SELECT COUNT(*) FROM `urls_%s` WHERE `URLMd5` = '%s'"
    self.urlMd5 = None
    self.urlMd5 = (urlObject.urlMd5 if urlObject.urlMd5 is not None
                   else hashlib.md5(urlObject.url).hexdigest())
    rows = queryCallback(LOCAL_URL_CHECK_QUERY % (urlObject.siteId, self.urlMd5), Constants.SECONDARY_DB_ID)
    return bool(hasattr(rows, '__iter__') and len(rows) > 0 and len(rows[0]) > 0 and rows[0][0] > 0)
Here is the caller graph for this function:

◆ siteTableOperation()

def dc_db.URLNewTask.URLNewTask.siteTableOperation (   self,
  urlObj,
  queryCallback 
)

Definition at line 96 of file URLNewTask.py.

def siteTableOperation(self, urlObj, queryCallback):
    """Settle urlObj.siteId according to urlObj.siteSelect.

    - SITE_SELECT_TYPE_EXPLICIT: siteId becomes "0" when it is empty or the
      site does not exist.
    - SITE_SELECT_TYPE_AUTO: the site id is computed from the URL; an existing
      site is reused (and its related fields copied into urlObj), otherwise a
      new site is created from the canonical domain URL.
    - SITE_SELECT_TYPE_QUALIFY_URL: siteId becomes "0" when the site computed
      from the URL does not exist.
    - SITE_SELECT_TYPE_NONE: raises when the site computed from the URL does
      not exist.

    Raises Exception for an unknown siteSelect value, for a missing site in
    NONE mode, and when no canonical URL could be derived in AUTO mode.
    """
    if urlObj.siteSelect == dc.EventObjects.URL.SITE_SELECT_TYPE_EXPLICIT:
        if urlObj.siteId == "" or not self.isSiteExist(urlObj.siteId, queryCallback):
            urlObj.siteId = "0"
    elif urlObj.siteSelect == dc.EventObjects.URL.SITE_SELECT_TYPE_AUTO:
        try:
            canonicUrl = Utils.UrlParser.generateDomainUrl(urlObj.url)
            if canonicUrl is not None and len(canonicUrl) > 0 and canonicUrl[-1] != '/':
                canonicUrl += '/'
            localSiteId = self.calcSiteIdByUrl(urlObj.url)
            logger.debug(">>> S_NEW_ID=" + str(localSiteId))
            if self.isSiteExist(localSiteId, queryCallback):
                urlObj.siteId = localSiteId
                self.fillSiteRelatedFields(urlObj, queryCallback)
            elif canonicUrl is not None:
                self.newSiteCreate(canonicUrl, queryCallback)
                urlObj.siteId = localSiteId
            else:
                raise Exception(">>> canonicUrl is None !!!")
        # NOTE(review): the except clause (source line 115) is missing from the
        # listing this block was recovered from; the exception name is inferred
        # from the log message below - confirm against URLNewTask.py.
        except UrlParseException:
            logger.debug(">>> UrlParseException")
    elif urlObj.siteSelect == dc.EventObjects.URL.SITE_SELECT_TYPE_QUALIFY_URL:
        localSiteId = self.calcSiteIdByUrl(urlObj.url)
        if not self.isSiteExist(localSiteId, queryCallback):
            urlObj.siteId = "0"
    elif urlObj.siteSelect == dc.EventObjects.URL.SITE_SELECT_TYPE_NONE:
        localSiteId = self.calcSiteIdByUrl(urlObj.url)
        if not self.isSiteExist(localSiteId, queryCallback):
            # The exception used to be constructed but never raised, so a
            # missing site went unnoticed.
            raise Exception(">>> urlObj operation can't find siteId")
    else:
        raise Exception(">>> urlObj.siteSelect field has wrong value - %s" % str(urlObj.siteSelect))
Here is the call graph for this function:
Here is the caller graph for this function:

◆ urlInsertWithGoodSietId()

def dc_db.URLNewTask.URLNewTask.urlInsertWithGoodSietId (   self,
  urlObj,
  statusInit,
  queryCallback 
)

Definition at line 165 of file URLNewTask.py.

def urlInsertWithGoodSietId(self, urlObj, statusInit, queryCallback):
    """Insert urlObj, or update it when it is already stored, and return a status.

    Returns CODE_GOOD_INSERT after a successful insert, CODE_ALREADY_EXIST when
    selectURL() finds the URL, and statusInit unchanged when the URL was not
    found but addURL() did not insert it. In every case the site is
    recalculated afterwards and an optional urlPut operation is run.
    """
    status = statusInit
    if self.selectURL(urlObj, queryCallback):
        status = URLNewTask.CODE_ALREADY_EXIST
        if urlObj.urlUpdate is not None:
            logger.debug(">>> Url New Start Internal urlUpdate")
            self.urlUpdateTask.process([urlObj.urlUpdate], queryCallback)
        # NOTE(review): listing indentation was lost; this attributes step is
        # assumed to belong to the "already exists" branch as a sibling of the
        # urlUpdate check - confirm against URLNewTask.py.
        if urlObj.attributes is not None and len(urlObj.attributes) > 0:
            self.attributesSet(urlObj.attributes, queryCallback)
    elif self.addURL(urlObj, queryCallback):
        status = URLNewTask.CODE_GOOD_INSERT
        if urlObj.attributes is not None and len(urlObj.attributes) > 0:
            self.attributesSet(urlObj.attributes, queryCallback)

    self.recalculator.commonRecalc(urlObj.siteId, queryCallback)
    # urlPut is optional: only act when the instance actually carries it.
    if "urlPut" in urlObj.__dict__ and urlObj.urlPut is not None:
        self.urlUpdateTask.urlPutOperation(urlObj, urlObj.urlPut, queryCallback)
    return status
Here is the call graph for this function:
Here is the caller graph for this function:

Member Data Documentation

◆ CODE_ALREADY_EXIST

int dc_db.URLNewTask.URLNewTask.CODE_ALREADY_EXIST = 2
static

Definition at line 28 of file URLNewTask.py.

◆ CODE_BAD_INSERT

int dc_db.URLNewTask.URLNewTask.CODE_BAD_INSERT = 1
static

Definition at line 27 of file URLNewTask.py.

◆ CODE_GOOD_INSERT

int dc_db.URLNewTask.URLNewTask.CODE_GOOD_INSERT = 0
static

Definition at line 26 of file URLNewTask.py.

◆ recalculator

dc_db.URLNewTask.URLNewTask.recalculator

Definition at line 35 of file URLNewTask.py.

◆ siteTask

dc_db.URLNewTask.URLNewTask.siteTask

Definition at line 34 of file URLNewTask.py.

◆ urlMd5

dc_db.URLNewTask.URLNewTask.urlMd5

Definition at line 36 of file URLNewTask.py.

◆ urlUpdateTask

dc_db.URLNewTask.URLNewTask.urlUpdateTask

Definition at line 37 of file URLNewTask.py.


The documentation for this class was generated from the following file: