264 def getURLContent(self, urlContentRequest, queryCallback):
265 dataDir = self.rawDataDir +
"/" + urlContentRequest.siteId +
"/" + PathMaker(urlContentRequest.urlMd5).getDir()
267 self.contentMask = urlContentRequest.contentTypeMask
269 if self.contentMask & (dc.EventObjects.URLContentRequest.CONTENT_TYPE_PROCESSED | \
270 dc.EventObjects.URLContentRequest.CONTENT_TYPE_PROCESSED_INTERNAL | \
271 dc.EventObjects.URLContentRequest.CONTENT_TYPE_PROCESSED_CUSTOM):
272 self.processedContents.extend(self.contentProcessed(dataDir, urlContentRequest, self.contentMask, queryCallback))
274 if self.contentMask & dc.EventObjects.URLContentRequest.CONTENT_TYPE_RAW:
275 if self.contentMask & dc.EventObjects.URLContentRequest.CONTENT_TYPE_RAW_LAST:
276 self.contentRawCommon(dataDir,
True,
False)
277 if self.contentMask & dc.EventObjects.URLContentRequest.CONTENT_TYPE_RAW_FIRST:
278 self.contentRawCommon(dataDir,
False,
False)
279 if self.contentMask & dc.EventObjects.URLContentRequest.CONTENT_TYPE_RAW_ALL:
280 self.contentRawCommon(dataDir,
False,
True)
282 self.fillAdditionContentTypes(dc.EventObjects.URLContentRequest.CONTENT_TYPE_TIDY,
283 dc.EventObjects.Content.CONTENT_TIDY_CONTENT, DC_CONSTANTS.RAW_DATA_TIDY_SUFF,
286 self.fillAdditionContentTypes(dc.EventObjects.URLContentRequest.CONTENT_TYPE_DYNAMIC,
287 dc.EventObjects.Content.CONTENT_DYNAMIC_CONTENT, DC_CONSTANTS.RAW_DATA_DYNAMIC_SUFF,
290 self.fillAdditionContentTypes(dc.EventObjects.URLContentRequest.CONTENT_TYPE_CHAIN,
291 dc.EventObjects.Content.CONTENT_CHAIN_PARTS, DC_CONSTANTS.RAW_DATA_CHAIN_SUFF,
294 logger.debug(
"!!!!! self.processedContents: %s", Utils.varDump(self.processedContents, stringifyType=0, ensure_ascii=
False, strTypeMaxLen=5000))
297 ret.headers = self.headers
298 ret.requests = self.requests
300 ret.cookies = self.cookies
301 row = self.selectURLFromMySQL(urlContentRequest.siteId, urlContentRequest.urlMd5, queryCallback)
305 ret.status = row[
"Status"]
309 ret.urlMd5 = row[
"URLMd5"]
310 if "RawContentMd5" in row:
311 ret.rawContentMd5 = row[
"RawContentMd5"]
312 if "ContentURLMd5" in row:
313 ret.contentURLMd5 = row[
"ContentURLMd5"]
315 ret.siteId = row[
"Site_Id"]
316 if hasattr(urlContentRequest.dbFieldsList,
'__iter__')
and len(urlContentRequest.dbFieldsList) > 0:
317 ret.dbFields = self.genDBFields(urlContentRequest.dbFieldsList, \
318 urlContentRequest.dbFieldsListDefaultValues, \
321 if self.contentMask & dc.EventObjects.URLContentRequest.CONTENT_TYPE_ATTRIBUTES:
322 if ret.urlMd5
is not None and ret.urlMd5 !=
"" and ret.siteId
is not None:
323 ret.attributes = AttrFetchTask.fetchUrlsAttributesByNames(ret.siteId,
326 urlContentRequest.attributeNames)