HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
CollectProperties.py
Go to the documentation of this file.
1 """
2 @package: dc
3 @file CollectProperties.py
4 @author Scorp <developers.hce@gmail.com>
5 @link: http://hierarchical-cluster-engine.com/
6 @copyright: Copyright &copy; 2013-2014 IOIX Ukraine
7 @license: http://hierarchical-cluster-engine.com/license/
8 @since: 0.1
9 """
10 import hashlib
11 import json
12 import os
13 import sqlite3
14 import lxml
15 
16 from app.Utils import ExceptionLog
17 import app.Utils as Utils # pylint: disable=F0401
18 
19 logger = Utils.MPLogger().getLogger()
20 
21 
# #CollectProperties stores per-page properties (title, redirect history,
# internal and external links) in a per-site sqlite key-value DB and then
# hands aggregated values (link counts, content size, content md5 and a link
# frequency) to the `urlProcess` object.
#
# The caller is expected to assign siteId, kvDbDir, res, batchItem, realUrl
# and urlProcess on the instance before calling process().
class CollectProperties(object):

    # CREATE TABLE statements for the key-value tables, keyed by table name
    KV_TABLE_TEMPLATES = {
        "titles": ''' CREATE TABLE titles (
            url_id VARCHAR(32) NOT NULL PRIMARY KEY,
            data VARCHAR(100) NOT NULL DEFAULT '')''',

        "redirects": ''' CREATE TABLE redirects (
            url_id VARCHAR(32) NOT NULL PRIMARY KEY,
            data VARCHAR(1000000) NOT NULL DEFAULT '') ''',

        "internal_links": ''' CREATE TABLE internal_links (
            url_id VARCHAR(32) NOT NULL PRIMARY KEY,
            data VARCHAR(1000000) NOT NULL DEFAULT '') ''',

        "external_links": ''' CREATE TABLE external_links (
            url_id VARCHAR(32) NOT NULL PRIMARY KEY,
            data VARCHAR(1000000) NOT NULL DEFAULT '') '''
    }
    # Names of the tables that checkKVTable() verifies and creates on demand
    KV_DB_TABLE_NAMES = ("titles", "redirects", "internal_links", "external_links")

    # #constructor - all working fields start as None and must be assigned
    # by the caller before process() is used
    def __init__(self):
        self.siteId = None
        self.kvDbDir = None
        self.res = None
        self.batchItem = None
        self.realUrl = None
        self.urlProcess = None


    # #checkFieldsIsNone method checks all class's mandatory fields
    #
    # @param checkList - iterable of attribute names that must be set
    # @raise Exception if any listed attribute is missing or is None
    def checkFieldsIsNone(self, checkList):
        for name in checkList:
            if getattr(self, name, None) is None:
                # Interpolate the name into the message; passing it as a second
                # Exception argument left the `%s` placeholder unformatted.
                raise Exception("Some mandatory field `%s` must be initialized!" % name)


    # #process - main collectProperties processing point
    #
    # Failures of either collecting step are reported through
    # ExceptionLog.handler and are not propagated to the caller.
    #
    # @param dom - the dom tree of the page
    # @param internalLinks internal link list
    # @param externalLinks external link list
    # @raise Exception if an argument is None or a mandatory field is not set
    def process(self, dom, internalLinks, externalLinks):
        if dom is None:
            raise Exception(">>> [CollectProperties.process] dom param must be not None")
        if internalLinks is None:
            raise Exception(">>> [CollectProperties.process] internalLinks param must be not None")
        if externalLinks is None:
            raise Exception(">>> [CollectProperties.process] externalLinks param must be not None")
        self.checkFieldsIsNone(["siteId", "kvDbDir", "res", "batchItem", "realUrl"])

        kvConnector = None
        kvCursor = None
        try:
            kvConnector, kvCursor = self.collectProperties(dom, internalLinks, externalLinks, self.siteId,
                                                           self.kvDbDir, self.res, self.batchItem.urlId)
        except Exception as err:
            ExceptionLog.handler(logger, err, "collect base properties to key-value db failed",
                                 (self.siteId, self.kvDbDir, self.res, self.batchItem.urlId))

        # Nothing was opened (or it was already closed by collectProperties)
        if kvConnector is None or kvCursor is None:
            return

        try:
            self.collectAddtionalProp(kvCursor, len(internalLinks), len(externalLinks), self.batchItem,
                                      self.realUrl)
        except Exception as err:
            # One-element tuple, matching the tuple passed by the handler call
            # above; `(self.realUrl)` without the comma was a bare string.
            ExceptionLog.handler(logger, err, "collect addtional propeties to main db failed",
                                 (self.realUrl,))
        finally:
            # Release the sqlite connection even if the error handler raises
            kvConnector.close()


    # #collectProperties collect page properties to Key-Value DB
    #
    # Writes one row per table (titles, redirects, internal_links,
    # external_links) for the given url id and commits.
    #
    # @param dom - the dom tree of the page
    # @param internalLinks internal link list
    # @param externalLinks external link list
    # @param siteId - site's id
    # @param kvDbDir - kvdb storage directory
    # @param res - resource object; its `redirects` items must provide
    #              `headers` and `status_code`
    # @param urlId - url's id
    # @return (kvConnector, kvCursor) - the still-open connection and cursor;
    #         the caller is responsible for closing the connection
    # @raise any exception from the DB or serialization; the connection is
    #        closed before the exception is re-raised
    def collectProperties(self, dom, internalLinks, externalLinks, siteId, kvDbDir, res, urlId):
        kvConnector, kvCursor = self.prepareKvDbConnector(siteId, kvDbDir)
        try:
            self.checkKVTable(kvConnector, kvCursor)

            # The text of the first <title> element is the title itself. The
            # previous code tested that text with isinstance(..., _Element),
            # which never matched, so the title was always stored empty.
            # An absent title is stored as '' (the column default, which is what
            # INSERT OR REPLACE substituted for NULL before).
            title = ""
            titleNode = dom.find(".//title")
            if titleNode is not None and titleNode.text is not None:
                title = titleNode.text
            if isinstance(title, bytes):
                # Byte strings are decoded so the DB always receives text
                title = title.decode('utf-8')

            histories = [
                {
                    "status_code": history.status_code,
                    "headers": '\r\n'.join(['%s: %s' % (k, v) for k, v in history.headers.items()]),
                }
                for history in res.redirects
            ]

            rows = (
                ('''INSERT OR REPLACE INTO titles(url_id, data) VALUES(?, ?)''', title),
                ('''INSERT OR REPLACE INTO redirects(url_id, data) VALUES(?, ?)''', json.dumps(histories)),
                ('''INSERT OR REPLACE INTO internal_links(url_id, data) VALUES(?, ?)''',
                 json.dumps(internalLinks)),
                ('''INSERT OR REPLACE INTO external_links(url_id, data) VALUES(?, ?)''',
                 json.dumps(externalLinks)),
            )
            for sql, data in rows:
                kvCursor.execute(sql, (urlId, data))

            kvConnector.commit()
        except Exception:
            # The caller never receives the connector on failure, so it must be
            # closed here to avoid leaking the sqlite connection.
            kvConnector.close()
            raise
        return kvConnector, kvCursor


    # #collectAddtionalProp collect additional page properties and pass them
    # to self.urlProcess.updateAdditionProps
    #
    # Computes the content size and md5 of self.res.str_content and counts
    # the rows of internal_links (other than this url's own row) whose stored
    # data contains realUrl.
    #
    # @param kvCursor - incoming kvdb Cursor (rows must be addressable by
    #                   column name, see prepareKvDbConnector)
    # @param internalLinksCount number of internal links
    # @param externalLinksCount number of external links
    # @param batchItem - bathItem object; `urlId` and `siteId` are read
    # @param realUrl - primary resource's url
    # @raise Exception if `res` or `urlProcess` is not set
    def collectAddtionalProp(self, kvCursor, internalLinksCount, externalLinksCount, batchItem, realUrl):
        self.checkFieldsIsNone(["res", "urlProcess"])
        content = self.res.str_content
        size = len(content)
        contentMd5 = hashlib.md5(content).hexdigest()

        # Bound parameter instead of string formatting: no quoting problems
        # and no SQL injection through the url id.
        kvCursor.execute("SELECT data FROM internal_links WHERE url_id <> ?", (batchItem.urlId,))
        # NOTE(review): this is a substring test on the stored JSON text, so a
        # url that is a prefix of another link is counted too - confirm intent.
        freq = sum(1 for row in kvCursor.fetchall() if realUrl in row["data"])

        self.urlProcess.siteId = batchItem.siteId
        self.urlProcess.updateAdditionProps(internalLinksCount, externalLinksCount, batchItem, size, freq,
                                            contentMd5)


    # prepare sqlite DB connector and cursor object
    #
    # The DB file is "<siteId>_fields.db" inside kvDbDir. Rows are returned
    # as sqlite3.Row and text as unicode.
    #
    # @param siteId - incoming site's Id
    # @param kvDbDir - incoming kvDb storage directory
    # @return kvConnector and kvCursor objects
    def prepareKvDbConnector(self, siteId, kvDbDir):
        dbFile = os.path.join(kvDbDir, "%s_fields.db" % (siteId,))
        kvConnector = sqlite3.connect(dbFile)
        kvConnector.row_factory = sqlite3.Row
        kvConnector.text_factory = unicode
        kvCursor = kvConnector.cursor()
        return kvConnector, kvCursor


    # # checkKVTable check whether the sqlite tables exist
    # if not, then create them
    #
    # @param kvConnector - incoming kvdb Connector
    # @param kvCursor - incoming kvdb Cursor (rows must be addressable by
    #                   column name)
    def checkKVTable(self, kvConnector, kvCursor):
        for table in self.KV_DB_TABLE_NAMES:
            kvCursor.execute("SELECT COUNT(*) as cnt FROM sqlite_master WHERE type='table' AND name=?",
                             (table,))
            if kvCursor.fetchone()["cnt"] == 0:
                logger.info("kv table %s dose not exist, createing...", table)
                kvCursor.execute(self.KV_TABLE_TEMPLATES[table])
                kvConnector.commit()
def process(self, dom, internalLinks, externalLinks)
def checkKVTable(self, kvConnector, kvCursor)
def collectAddtionalProp(self, kvCursor, internalLinksCount, externalLinksCount, batchItem, realUrl)
Definition: join.py:1
def collectProperties(self, dom, internalLinks, externalLinks, siteId, kvDbDir, res, urlId)