def getURLContentFromBatch(self):
    """Build one ``dc_event.URLContentResponse`` per item of ``self.items``.

    For every item the method gathers:

    * the processed content taken from ``item.urlPutObj.putDict`` (with the
      stored ``cDate`` when it is set, otherwise without a date);
    * the raw content taken from ``item.urlObj.urlPut.putDict``, only when
      the site property ``FETCH_RAW_CONTENT`` evaluates to ``1``;
    * the URL attributes from ``item.urlObj.attributes`` (best effort: a
      failure is logged and does not abort the batch);
    * a snapshot of the URL object's fields, stored as ``dbFields``;
    * the ``"properties"`` entry of ``item.urlPutObj.putDict`` when present.

    The responses are collected, in item order, in
    ``self.urlContentResponse`` (reset at the start of every call) and the
    resulting list is dumped to the debug log.

    Returns:
        None. The result is exposed through ``self.urlContentResponse``.
    """
    self.urlContentResponse = []

    for item in self.items:
        url = item.urlObj.url

        # Per-item defaults, so every name read further down is always bound
        # and nothing leaks over from the previous iteration.
        # NOTE(review): the concrete default values ([] / [] / None) are an
        # assumption -- confirm against dc_event.URLContentResponse.
        attributes = []
        contents = []
        rawContents = None

        if item.urlPutObj is not None:
            # Loading attributes is best effort; a broken attribute list must
            # not prevent the content itself from being returned.
            try:
                if len(item.urlObj.attributes) > 0:
                    self.logger.debug("item.urlPutObj.attributes: %s",
                                      varDump(item.urlObj.attributes))
                    attributes = item.urlObj.attributes
            except Exception as err:
                self.logger.error("load attributes failed: %s", str(err))

            putDict = item.urlPutObj.putDict
            if putDict["cDate"] is not None:
                contents = [dc_event.Content(putDict["data"], putDict["cDate"],
                                             dc_event.Content.CONTENT_PROCESSOR_CONTENT)]
            else:
                # No creation date stored: let Content use its own default.
                contents = [dc_event.Content(putDict["data"],
                                             typeId=dc_event.Content.CONTENT_PROCESSOR_CONTENT)]

        # Raw content is attached only when the site explicitly asks for it.
        isFetchRawContent = self.selectSiteProperty(item, "FETCH_RAW_CONTENT")
        if (item.urlObj.urlPut is not None and isFetchRawContent is not None
                and int(isFetchRawContent) == 1):
            rawPutDict = item.urlObj.urlPut.putDict
            rawContents = [dc_event.Content(rawPutDict["data"], rawPutDict["cDate"],
                                            dc_event.Content.CONTENT_RAW_CONTENT)]

        urlContentResponse = dc_event.URLContentResponse(url, rawContents,
                                                         processedContents=contents)
        # Hard-coded status code; its meaning is defined outside this block.
        urlContentResponse.status = 7
        urlContentResponse.urlMd5 = item.urlObj.urlMd5
        urlContentResponse.siteId = item.siteId
        urlContentResponse.contentURLMd5 = item.urlObj.contentURLMd5
        urlContentResponse.rawContentMd5 = item.urlObj.rawContentMd5
        urlContentResponse.attributes = attributes
        urlContentResponse.dbFields = {
            "Status": item.urlObj.status,
            "Crawled": item.urlObj.crawled,
            "Processed": item.urlObj.processed,
            "ContentType": item.urlObj.contentType,
            "Charset": item.urlObj.charset,
            "ErrorMask": item.urlObj.errorMask,
            "CrawlingTime": item.urlObj.crawlingTime,
            "ProcessingTime": item.urlObj.processingTime,
            "HttpCode": item.urlObj.httpCode,
            "Size": item.urlObj.size,
            "LinksI": item.urlObj.linksI,
            "LinksE": item.urlObj.linksE,
            "RawContentMd5": item.urlObj.rawContentMd5,
            "LastModified": item.urlObj.lastModified,
            "CDate": item.urlObj.CDate,
            "UDate": item.urlObj.UDate,
            "TagsMask": item.urlObj.tagsMask,
            "TagsCount": item.urlObj.tagsCount,
            "PDate": item.urlObj.pDate,
            "ContentURLMd5": item.urlObj.contentURLMd5,
            "BatchId": item.urlObj.batchId,
        }

        if item.urlPutObj is not None and "properties" in item.urlPutObj.putDict:
            urlContentResponse.itemProperties = item.urlPutObj.putDict["properties"]

        self.urlContentResponse.append(urlContentResponse)

    self.logger.debug("urlContentResponse: %s",
                      varDump(obj=self.urlContentResponse, strTypeMaxLen=5000))
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)