# -*- coding: utf-8 -*-
# @file CollectProperties.py
# @author Scorp <developers.hce@gmail.com>
# @link: http://hierarchical-cluster-engine.com/
# @copyright: Copyright © 2013-2014 IOIX Ukraine
# @license: http://hierarchical-cluster-engine.com/license/

# CREATE TABLE statements for the per-site key-value tables, keyed by table
# name. Every table has the same two-column shape: `url_id` is the primary
# key and `data` holds the stored payload as text.
# NOTE(review): the tables are created through sqlite3 elsewhere in this
# file; SQLite does not enforce VARCHAR(n) lengths, so the sizes below are
# presumably descriptive only - confirm before relying on them.
KV_TABLE_TEMPLATES = {
    "titles": """CREATE TABLE titles (
        url_id VARCHAR(32) NOT NULL PRIMARY KEY,
        data VARCHAR(100) NOT NULL DEFAULT '')""",

    "redirects": """CREATE TABLE redirects (
        url_id VARCHAR(32) NOT NULL PRIMARY KEY,
        data VARCHAR(1000000) NOT NULL DEFAULT '')""",

    "internal_links": """CREATE TABLE internal_links (
        url_id VARCHAR(32) NOT NULL PRIMARY KEY,
        data VARCHAR(1000000) NOT NULL DEFAULT '')""",

    "external_links": """CREATE TABLE external_links (
        url_id VARCHAR(32) NOT NULL PRIMARY KEY,
        data VARCHAR(1000000) NOT NULL DEFAULT '')""",
}

# Names of the key-value tables, in a fixed order. Each name is also a key
# of KV_TABLE_TEMPLATES.
KV_DB_TABLE_NAMES = (
    "titles",
    "redirects",
    "internal_links",
    "external_links",
)
def checkFieldsIsNone(self, checkList):
    """Verify that every attribute named in checkList is set on this object.

    @param checkList: iterable of attribute names that must exist on `self`
                      and must not be None.
    @raise Exception: on the first name that is missing or None; the
                      message names the offending field.
    """
    for name in checkList:
        if not hasattr(self, name) or getattr(self, name) is None:
            # Interpolate here: passing `name` as a second constructor
            # argument leaves the "%s" placeholder unformatted and the
            # message is rendered as a raw (format, name) tuple.
            raise Exception("Some mandatory field `%s` must be initialized!" % name)
69 def process(self, dom, internalLinks, externalLinks):
71 raise Exception(
">>> [CollectProperties.process] dom param must be not None")
72 if internalLinks
is None:
73 raise Exception(
">>> [CollectProperties.process] internalLinks param must be not None")
74 if externalLinks
is None:
75 raise Exception(
">>> [CollectProperties.process] externalLinks param must be not None")
81 except Exception, err:
82 ExceptionLog.handler(logger, err,
"collect base properties to key-value db failed", \
84 if kvCursor
is not None and kvConnector
is not None:
87 except Exception, err:
88 ExceptionLog.handler(logger, err,
"collect addtional propeties to main db failed", \
107 tmp = dom.find(
".//title")
110 if isinstance(domTitle, lxml.etree._Element):
111 title = domTitle.text
112 if isinstance(title, str):
113 title = title.decode(
'utf-8')
116 for history
in res.redirects:
117 textHeaders =
'\r\n'.
join([
'%s: %s' % (k, v)
for k, v
in history.headers.iteritems()])
118 historyItem = {
"status_code": history.status_code,
"headers": textHeaders}
119 histories.append(historyItem)
120 historiesData = json.dumps(histories)
121 internalLinksData = json.dumps(internalLinks)
122 externalLinksData = json.dumps(externalLinks)
124 kvCursor.execute(
'''INSERT OR REPLACE INTO titles(url_id, data) VALUES(?, ?)''', (urlId, title))
127 kvCursor.execute(
'''INSERT OR REPLACE INTO redirects(url_id, data) VALUES(?, ?)''', (urlId, historiesData))
130 kvCursor.execute(
'''INSERT OR REPLACE INTO internal_links(url_id, data) VALUES(?, ?)''', \
131 (urlId, internalLinksData))
134 kvCursor.execute(
'''INSERT OR REPLACE INTO external_links(url_id, data) VALUES(?, ?)''', \
135 (urlId, externalLinksData))
138 return kvConnector, kvCursor
153 size = len(self.
res.str_content)
154 contentMd5 = hashlib.md5(self.
res.str_content).hexdigest()
155 kvSql =
"SELECT data FROM internal_links WHERE url_id <> '%s'" % (batchItem.urlId,)
156 kvCursor.execute(kvSql)
158 for row
in kvCursor.fetchall():
159 urlInternalLists = row[
"data"]
160 if realUrl
in urlInternalLists:
164 self.
urlProcess.updateAdditionProps(internalLinksCount, externalLinksCount, batchItem, size, freq, contentMd5)
def prepareKvDbConnector(self, siteId, kvDbDir):
    """Open the site's key-value SQLite database and return a connection and cursor.

    The database file is "<siteId>_fields.db" inside kvDbDir.

    @param siteId: site identifier used to build the database file name.
    @param kvDbDir: directory that holds the database file.
    @return: tuple (connection, cursor).
    """
    dbPath = os.path.join(kvDbDir, "%s_fields.db" % (siteId,))
    connection = sqlite3.connect(dbPath)
    # Rows become sqlite3.Row objects so columns can be read by name.
    connection.row_factory = sqlite3.Row
    # TEXT values are handed back through `unicode`.
    connection.text_factory = unicode
    return connection, connection.cursor()
188 sql =
"SELECT COUNT(*) as cnt FROM sqlite_master WHERE type='table' AND name='%s'" % table
189 kvCursor.execute(sql)
190 if kvCursor.fetchone()[
"cnt"] == 0:
191 logger.info(
"kv table %s dose not exist, createing...", table)
def process(self, dom, internalLinks, externalLinks)
def checkKVTable(self, kvConnector, kvCursor)
def collectAddtionalProp(self, kvCursor, internalLinksCount, externalLinksCount, batchItem, realUrl)
def prepareKvDbConnector(self, siteId, kvDbDir)
def checkFieldsIsNone(self, checkList)
dictionary KV_TABLE_TEMPLATES
def collectProperties(self, dom, internalLinks, externalLinks, siteId, kvDbDir, res, urlId)