HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_crawler.CollectProperties.CollectProperties Class Reference
Inheritance diagram for dc_crawler.CollectProperties.CollectProperties:
Collaboration diagram for dc_crawler.CollectProperties.CollectProperties:

Public Member Functions

def __init__ (self)
 
def checkFieldsIsNone (self, checkList)
 
def process (self, dom, internalLinks, externalLinks)
 
def collectProperties (self, dom, internalLinks, externalLinks, siteId, kvDbDir, res, urlId)
 
def collectAddtionalProp (self, kvCursor, internalLinksCount, externalLinksCount, batchItem, realUrl)
 
def prepareKvDbConnector (self, siteId, kvDbDir)
 
def checkKVTable (self, kvConnector, kvCursor)
 

Public Attributes

 siteId
 
 kvDbDir
 
 res
 
 batchItem
 
 realUrl
 
 urlProcess
 

Static Public Attributes

dictionary KV_TABLE_TEMPLATES
 
tuple KV_DB_TABLE_NAMES = ("titles", "redirects", "internal_links", "external_links")
 

Detailed Description

Definition at line 22 of file CollectProperties.py.

Constructor & Destructor Documentation

◆ __init__()

def dc_crawler.CollectProperties.CollectProperties.__init__ (   self)

Definition at line 43 of file CollectProperties.py.

def __init__(self):
    """Create a collector with every input attribute unset.

    All six public attributes start as ``None``. ``process`` and
    ``collectAddtionalProp`` call ``checkFieldsIsNone`` on the ones they
    need, so the caller has to assign them before using those methods.
    """
    for attrName in ("siteId", "kvDbDir", "res", "batchItem", "realUrl", "urlProcess"):
        setattr(self, attrName, None)
50 
51 
def __init__(self)
constructor
Definition: UIDGenerator.py:19

Member Function Documentation

◆ checkFieldsIsNone()

def dc_crawler.CollectProperties.CollectProperties.checkFieldsIsNone (   self,
  checkList 
)

Definition at line 54 of file CollectProperties.py.

def checkFieldsIsNone(self, checkList):
    """Verify that every attribute named in ``checkList`` is set on this object.

    checkList -- iterable of attribute names to check.

    Raises ``Exception`` for the first name that is either missing from the
    instance or currently ``None``; the message contains that name.
    """
    for name in checkList:
        if not hasattr(self, name) or getattr(self, name) is None:
            # Interpolate with %: handing ``name`` to Exception as a second
            # argument would leave the `%s` placeholder unformatted.
            raise Exception("Some mandatory field `%s` must be initialized!" % name)
61 
62 
Here is the caller graph for this function:

◆ checkKVTable()

def dc_crawler.CollectProperties.CollectProperties.checkKVTable (   self,
  kvConnector,
  kvCursor 
)

Definition at line 186 of file CollectProperties.py.

def checkKVTable(self, kvConnector, kvCursor):
    """Create every table named in ``KV_DB_TABLE_NAMES`` that does not exist yet.

    kvConnector -- connection on which a newly created table is committed.
    kvCursor -- cursor used for the lookup and the create statement; its rows
                must be addressable by column name (the code reads ``row["cnt"]``).

    The create statement for a table is taken from ``KV_TABLE_TEMPLATES``.
    """
    lookupSql = "SELECT COUNT(*) as cnt FROM sqlite_master WHERE type='table' AND name='%s'"
    for tableName in self.KV_DB_TABLE_NAMES:
        kvCursor.execute(lookupSql % tableName)
        alreadyThere = kvCursor.fetchone()["cnt"] != 0
        if alreadyThere:
            continue
        logger.info("kv table %s dose not exist, createing...", tableName)
        kvCursor.execute(self.KV_TABLE_TEMPLATES[tableName])
        kvConnector.commit()
194 
Here is the caller graph for this function:

◆ collectAddtionalProp()

def dc_crawler.CollectProperties.CollectProperties.collectAddtionalProp (   self,
  kvCursor,
  internalLinksCount,
  externalLinksCount,
  batchItem,
  realUrl 
)

Definition at line 150 of file CollectProperties.py.

def collectAddtionalProp(self, kvCursor, internalLinksCount, externalLinksCount, batchItem, realUrl):
    """Compute size, MD5 and inbound-link frequency and pass them to ``urlProcess``.

    kvCursor -- cursor on the key-value database; rows must be addressable
                by column name (the code reads ``row["data"]``).
    internalLinksCount, externalLinksCount -- link counts forwarded unchanged.
    batchItem -- supplies ``urlId`` (excluded from the frequency scan) and
                 ``siteId`` (copied onto ``urlProcess``).
    realUrl -- URL looked for in the stored link data of other URLs.

    Requires ``self.res`` and ``self.urlProcess`` to be set; otherwise
    ``checkFieldsIsNone`` raises.
    """
    self.checkFieldsIsNone(["res", "urlProcess"])
    content = self.res.str_content
    size = len(content)
    contentMd5 = hashlib.md5(content).hexdigest()

    # Bind urlId as a parameter, the same way collectProperties writes it,
    # instead of pasting it into the SQL text: a quote in the id no longer
    # breaks (or alters) the statement.
    kvCursor.execute("SELECT data FROM internal_links WHERE url_id <> ?", (batchItem.urlId,))

    # NOTE(review): ``data`` is whatever was stored in internal_links.data
    # (collectProperties stores a json.dumps string), so this is a substring
    # test on serialized text, not an exact URL match - confirm that is intended.
    freq = sum(1 for row in kvCursor.fetchall() if realUrl in row["data"])

    self.urlProcess.siteId = batchItem.siteId
    self.urlProcess.updateAdditionProps(internalLinksCount, externalLinksCount, batchItem, size, freq, contentMd5)
165 
166 
Here is the call graph for this function:
Here is the caller graph for this function:

◆ collectProperties()

def dc_crawler.CollectProperties.CollectProperties.collectProperties (   self,
  dom,
  internalLinks,
  externalLinks,
  siteId,
  kvDbDir,
  res,
  urlId 
)

Definition at line 102 of file CollectProperties.py.

def collectProperties(self, dom, internalLinks, externalLinks, siteId, kvDbDir, res, urlId):
    """Store title, redirect history and link lists for one URL in the key-value db.

    dom -- parsed document; only ``dom.find(".//title")`` is used.
    internalLinks, externalLinks -- JSON-serializable link collections.
    siteId, kvDbDir -- passed to ``prepareKvDbConnector`` to open the database.
    res -- response object; ``res.redirects`` is iterated and each entry's
           ``status_code`` and ``headers`` are recorded.
    urlId -- key under which all four rows are written (INSERT OR REPLACE).

    Returns ``(kvConnector, kvCursor)``, still open, after committing.
    If anything fails after the connection was opened, the connection is
    closed and the original exception propagates - the caller never receives
    the handle in that case and so could not close it itself.
    """
    kvConnector, kvCursor = self.prepareKvDbConnector(siteId, kvDbDir)
    try:
        self.checkKVTable(kvConnector, kvCursor)

        title = None
        domTitle = None
        tmp = dom.find(".//title")
        if tmp is not None:
            domTitle = tmp.text
        # NOTE(review): domTitle comes from ``tmp.text``; if that is always a
        # string or None, this branch never runs and title stays None - confirm.
        if isinstance(domTitle, lxml.etree._Element):  # pylint: disable=E1101,W0212
            title = domTitle.text
        if isinstance(title, str):
            title = title.decode('utf-8')

        # One entry per redirect hop: status code plus headers flattened to
        # "name: value" lines joined with CRLF.
        histories = []
        for history in res.redirects:
            textHeaders = '\r\n'.join(['%s: %s' % (k, v) for k, v in history.headers.iteritems()])
            histories.append({"status_code": history.status_code, "headers": textHeaders})

        historiesData = json.dumps(histories)
        internalLinksData = json.dumps(internalLinks)
        externalLinksData = json.dumps(externalLinks)

        # save title
        kvCursor.execute('''INSERT OR REPLACE INTO titles(url_id, data) VALUES(?, ?)''', (urlId, title))

        # save redirects
        kvCursor.execute('''INSERT OR REPLACE INTO redirects(url_id, data) VALUES(?, ?)''', (urlId, historiesData))

        # save internal links
        kvCursor.execute('''INSERT OR REPLACE INTO internal_links(url_id, data) VALUES(?, ?)''',
                         (urlId, internalLinksData))

        # save external links
        kvCursor.execute('''INSERT OR REPLACE INTO external_links(url_id, data) VALUES(?, ?)''',
                         (urlId, externalLinksData))

        kvConnector.commit()
    except Exception:
        # Do not leak the connection: nobody else holds a reference to it.
        kvConnector.close()
        raise
    return kvConnector, kvCursor
139 
140 
Definition: join.py:1
Here is the call graph for this function:
Here is the caller graph for this function:

◆ prepareKvDbConnector()

def dc_crawler.CollectProperties.CollectProperties.prepareKvDbConnector (   self,
  siteId,
  kvDbDir 
)

Definition at line 172 of file CollectProperties.py.

def prepareKvDbConnector(self, siteId, kvDbDir):
    """Open the per-site key-value SQLite file and return ``(connection, cursor)``.

    siteId -- used to build the file name ``<siteId>_fields.db``.
    kvDbDir -- directory that holds the file.

    The connection is configured with ``sqlite3.Row`` as row factory and
    ``unicode`` as text factory before the cursor is created.
    """
    connection = sqlite3.connect(os.path.join(kvDbDir, "%s_fields.db" % (siteId,)))
    connection.row_factory = sqlite3.Row
    connection.text_factory = unicode
    return connection, connection.cursor()
179 
180 
Here is the caller graph for this function:

◆ process()

def dc_crawler.CollectProperties.CollectProperties.process (   self,
  dom,
  internalLinks,
  externalLinks 
)

Definition at line 69 of file CollectProperties.py.

def process(self, dom, internalLinks, externalLinks):
    """Run both collection stages for the current URL.

    dom, internalLinks, externalLinks -- must all be non-None; each is
    checked up front and an ``Exception`` is raised for the first that is None.

    Also requires ``siteId``, ``kvDbDir``, ``res``, ``batchItem`` and
    ``realUrl`` to be set on the instance (checked via ``checkFieldsIsNone``).

    Stage 1 (``collectProperties``) writes the base properties and hands back
    an open connection and cursor. Stage 2 (``collectAddtionalProp``) runs only
    if stage 1 succeeded. Failures in either stage are passed to
    ``ExceptionLog.handler`` rather than re-raised here.
    """
    if dom is None:
        raise Exception(">>> [CollectProperties.process] dom param must be not None")
    if internalLinks is None:
        raise Exception(">>> [CollectProperties.process] internalLinks param must be not None")
    if externalLinks is None:
        raise Exception(">>> [CollectProperties.process] externalLinks param must be not None")
    self.checkFieldsIsNone(["siteId", "kvDbDir", "res", "batchItem", "realUrl"])

    # Both handles start as None so the check below is well-defined even when
    # stage 1 fails before assigning them.
    kvConnector = None
    kvCursor = None
    try:
        kvConnector, kvCursor = self.collectProperties(dom, internalLinks, externalLinks, self.siteId,
                                                       self.kvDbDir, self.res, self.batchItem.urlId)
    except Exception as err:
        ExceptionLog.handler(logger, err, "collect base properties to key-value db failed",
                             (self.siteId, self.kvDbDir, self.res, self.batchItem.urlId))

    if kvCursor is not None and kvConnector is not None:
        try:
            self.collectAddtionalProp(kvCursor, len(internalLinks), len(externalLinks), self.batchItem,
                                      self.realUrl)
        except Exception as err:
            # NOTE(review): ``(self.realUrl)`` is a plain value, not a 1-tuple,
            # unlike the tuple passed by the handler call above - confirm
            # which form ExceptionLog.handler expects.
            ExceptionLog.handler(logger, err, "collect addtional propeties to main db failed",
                                 (self.realUrl))
        finally:
            # Close even if the stage or its error handler lets something escape.
            kvConnector.close()
91 
92 
Here is the call graph for this function:
Here is the caller graph for this function:

Member Data Documentation

◆ batchItem

dc_crawler.CollectProperties.CollectProperties.batchItem

Definition at line 47 of file CollectProperties.py.

◆ KV_DB_TABLE_NAMES

tuple dc_crawler.CollectProperties.CollectProperties.KV_DB_TABLE_NAMES = ("titles", "redirects", "internal_links", "external_links")
static

Definition at line 41 of file CollectProperties.py.

◆ KV_TABLE_TEMPLATES

dictionary dc_crawler.CollectProperties.CollectProperties.KV_TABLE_TEMPLATES
static
Initial value:
= {
"titles": ,
"redirects": ,
"internal_links": ,
"external_links":
}

Definition at line 24 of file CollectProperties.py.

◆ kvDbDir

dc_crawler.CollectProperties.CollectProperties.kvDbDir

Definition at line 45 of file CollectProperties.py.

◆ realUrl

dc_crawler.CollectProperties.CollectProperties.realUrl

Definition at line 48 of file CollectProperties.py.

◆ res

dc_crawler.CollectProperties.CollectProperties.res

Definition at line 46 of file CollectProperties.py.

◆ siteId

dc_crawler.CollectProperties.CollectProperties.siteId

Definition at line 44 of file CollectProperties.py.

◆ urlProcess

dc_crawler.CollectProperties.CollectProperties.urlProcess

Definition at line 49 of file CollectProperties.py.


The documentation for this class was generated from the following file: