|
def | __init__ (self) |
|
def | checkFieldsIsNone (self, checkList) |
|
def | process (self, dom, internalLinks, externalLinks) |
|
def | collectProperties (self, dom, internalLinks, externalLinks, siteId, kvDbDir, res, urlId) |
|
def | collectAddtionalProp (self, kvCursor, internalLinksCount, externalLinksCount, batchItem, realUrl) |
|
def | prepareKvDbConnector (self, siteId, kvDbDir) |
|
def | checkKVTable (self, kvConnector, kvCursor) |
|
Definition at line 22 of file CollectProperties.py.
◆ __init__()
def dc_crawler.CollectProperties.CollectProperties.__init__(self)
◆ checkFieldsIsNone()
def dc_crawler.CollectProperties.CollectProperties.checkFieldsIsNone(self, checkList)
Definition at line 54 of file CollectProperties.py.
54 def checkFieldsIsNone(self, checkList):
58   for name in checkList:
59     if not hasattr(self, name) or getattr(self, name) is None:
60       raise Exception("Some mandatory field `%s` must be initialized!", name)
◆ checkKVTable()
def dc_crawler.CollectProperties.CollectProperties.checkKVTable(self, kvConnector, kvCursor)
Definition at line 186 of file CollectProperties.py.
186 def checkKVTable(self, kvConnector, kvCursor):
187   for table in self.KV_DB_TABLE_NAMES:
188     sql = "SELECT COUNT(*) as cnt FROM sqlite_master WHERE type='table' AND name='%s'" % table
189     kvCursor.execute(sql)
190     if kvCursor.fetchone()["cnt"] == 0:
191       logger.info("kv table %s dose not exist, createing...", table)
192       kvCursor.execute(self.KV_TABLE_TEMPLATES[table])
◆ collectAddtionalProp()
def dc_crawler.CollectProperties.CollectProperties.collectAddtionalProp(self, kvCursor, internalLinksCount, externalLinksCount, batchItem, realUrl)
Definition at line 150 of file CollectProperties.py.
150 def collectAddtionalProp(self, kvCursor, internalLinksCount, externalLinksCount, batchItem, realUrl):
151 self.checkFieldsIsNone(["res", "urlProcess"])
153 size = len(self.res.str_content)
154 contentMd5 = hashlib.md5(self.res.str_content).hexdigest()
155 kvSql = "SELECT data FROM internal_links WHERE url_id <> '%s'" % (batchItem.urlId,)
156 kvCursor.execute(kvSql)
158 for row in kvCursor.fetchall():
159 urlInternalLists = row["data"]
160 if realUrl in urlInternalLists:
163 self.urlProcess.siteId = batchItem.siteId
164 self.urlProcess.updateAdditionProps(internalLinksCount, externalLinksCount, batchItem, size, freq, contentMd5)
◆ collectProperties()
def dc_crawler.CollectProperties.CollectProperties.collectProperties(self, dom, internalLinks, externalLinks, siteId, kvDbDir, res, urlId)
Definition at line 102 of file CollectProperties.py.
102 def collectProperties(self, dom, internalLinks, externalLinks, siteId, kvDbDir, res, urlId):
103 kvConnector, kvCursor = self.prepareKvDbConnector(siteId, kvDbDir)
104 self.checkKVTable(kvConnector, kvCursor)
107 tmp = dom.find(".//title")
110 if isinstance(domTitle, lxml.etree._Element):
111 title = domTitle.text
112 if isinstance(title, str):
113 title = title.decode('utf-8')
116 for history in res.redirects:
117 textHeaders = '\r\n'.join(['%s: %s' % (k, v) for k, v in history.headers.iteritems()])
118 historyItem = {"status_code": history.status_code, "headers": textHeaders}
119 histories.append(historyItem)
120 historiesData = json.dumps(histories)
121 internalLinksData = json.dumps(internalLinks)
122 externalLinksData = json.dumps(externalLinks)
124 kvCursor.execute('''INSERT OR REPLACE INTO titles(url_id, data) VALUES(?, ?)''', (urlId, title))
127 kvCursor.execute('''INSERT OR REPLACE INTO redirects(url_id, data) VALUES(?, ?)''', (urlId, historiesData))
130 kvCursor.execute('''INSERT OR REPLACE INTO internal_links(url_id, data) VALUES(?, ?)''', \
131 (urlId, internalLinksData))
134 kvCursor.execute('''INSERT OR REPLACE INTO external_links(url_id, data) VALUES(?, ?)''', \
135 (urlId, externalLinksData))
138 return kvConnector, kvCursor
◆ prepareKvDbConnector()
def dc_crawler.CollectProperties.CollectProperties.prepareKvDbConnector(self, siteId, kvDbDir)
Definition at line 172 of file CollectProperties.py.
172 def prepareKvDbConnector(self, siteId, kvDbDir):
173   dbFile = os.path.join(kvDbDir, "%s_fields.db" % (siteId,))
174   kvConnector = sqlite3.connect(dbFile)
175   kvConnector.row_factory = sqlite3.Row
176   kvConnector.text_factory = unicode
177   kvCursor = kvConnector.cursor()
178   return kvConnector, kvCursor
◆ process()
def dc_crawler.CollectProperties.CollectProperties.process(self, dom, internalLinks, externalLinks)
Definition at line 69 of file CollectProperties.py.
69 def process(self, dom, internalLinks, externalLinks):
71 raise Exception(">>> [CollectProperties.process] dom param must be not None")
72 if internalLinks is None:
73 raise Exception(">>> [CollectProperties.process] internalLinks param must be not None")
74 if externalLinks is None:
75 raise Exception(">>> [CollectProperties.process] externalLinks param must be not None")
76 self.checkFieldsIsNone(["siteId", "kvDbDir", "res", "batchItem", "realUrl"])
79 kvConnector, kvCursor = self.collectProperties(dom, internalLinks, externalLinks, self.siteId, self.kvDbDir,
80 self.res, self.batchItem.urlId)
81 except Exception, err:
82 ExceptionLog.handler(logger, err, "collect base properties to key-value db failed", \
83 (self.siteId, self.kvDbDir, self.res, self.batchItem.urlId))
84 if kvCursor is not None and kvConnector is not None:
86 self.collectAddtionalProp(kvCursor, len(internalLinks), len(externalLinks), self.batchItem, self.realUrl)
87 except Exception, err:
88 ExceptionLog.handler(logger, err, "collect addtional propeties to main db failed", \
◆ batchItem
dc_crawler.CollectProperties.CollectProperties.batchItem |
◆ KV_DB_TABLE_NAMES
tuple dc_crawler.CollectProperties.CollectProperties.KV_DB_TABLE_NAMES = ("titles", "redirects", "internal_links", "external_links") |
|
static |
◆ KV_TABLE_TEMPLATES
dictionary dc_crawler.CollectProperties.CollectProperties.KV_TABLE_TEMPLATES |
|
static |
Initial value:= {
"titles": ,
"redirects": ,
"internal_links": ,
"external_links":
}
Definition at line 24 of file CollectProperties.py.
◆ kvDbDir
dc_crawler.CollectProperties.CollectProperties.kvDbDir |
◆ realUrl
dc_crawler.CollectProperties.CollectProperties.realUrl |
◆ res
dc_crawler.CollectProperties.CollectProperties.res |
◆ siteId
dc_crawler.CollectProperties.CollectProperties.siteId |
◆ urlProcess
dc_crawler.CollectProperties.CollectProperties.urlProcess |
The documentation for this class was generated from the following file: