__init__(self, protocols=None) | dc_crawler.URLProcess.URLProcess | |
additionalUrlObjInit(urlObj, urlInitParam, conditionalData) | dc_crawler.URLProcess.URLProcess | static |
addURLFromBatchToDB(self, batchItem, crawlerType, recrawlPeriod, autoRemoveProps) | dc_crawler.URLProcess.URLProcess | |
autoRemoveURL(autoRemoveProps, recrawlPeriod, urlTable, wrapper) | dc_crawler.URLProcess.URLProcess | static |
checkDictEmptyStrings(inDict, keys) | dc_crawler.URLProcess.URLProcess | static |
checkFieldsIsNone(self, checkList) | dc_crawler.URLProcess.URLProcess | |
checkUrlByPath(self, url) | dc_crawler.URLProcess.URLProcess | |
checkUrlByProtocol(self, url) | dc_crawler.URLProcess.URLProcess | |
conditionEvaluate(condition, conditionalData) | dc_crawler.URLProcess.URLProcess | static |
createUrlObjForChain(self, pattern, urlMd5, formMethods, parentMd5, depth, detectedMime, maxURLsFromPage) | dc_crawler.URLProcess.URLProcess | |
createUrlObjForCollectURLs(self, urlMd5, formMethods, parentMd5, depth, detectedMime, maxURLsFromPage) | dc_crawler.URLProcess.URLProcess | |
dbWrapper | dc_crawler.URLProcess.URLProcess | |
DC_URLS_TABLE_PREFIX | dc_crawler.URLProcess.URLProcess | static |
DEFAULT_PROTOCOLS | dc_crawler.URLProcess.URLProcess | static |
DETECT_MIME_TIMEOUT | dc_crawler.URLProcess.URLProcess | static |
detectUrlMime(self, contentTypeMap=None, urlObj=None) | dc_crawler.URLProcess.URLProcess | |
fillRssFieldInUrlObj(self, oldUrl, objectUrlUlr, batchItem, processorName, feed, rootFeed=False) | dc_crawler.URLProcess.URLProcess | |
fillRssFieldOneElem(self, entry, urlObj, batchItem, status, crawled, localType) | dc_crawler.URLProcess.URLProcess | |
getDepthFromUrl(self, urlMd5) | dc_crawler.URLProcess.URLProcess | |
getRealUrl(self) | dc_crawler.URLProcess.URLProcess | |
isUpdateCollection | dc_crawler.URLProcess.URLProcess | |
isUrlExist(self, recrawlPeriod, urlMd5) | dc_crawler.URLProcess.URLProcess | |
normMask | dc_crawler.URLProcess.URLProcess | |
PATTERN_WITH_PROTOCOL | dc_crawler.URLProcess.URLProcess | static |
processURL(self, realUrl, internalLinks, externalLinks, filtersApply=None, siteFilters=None, baseUrl=None) | dc_crawler.URLProcess.URLProcess | |
PROTOCOL_PREFIX | dc_crawler.URLProcess.URLProcess | static |
protocolsList | dc_crawler.URLProcess.URLProcess | |
readCurrentCnt(self, maxURLs) | dc_crawler.URLProcess.URLProcess | |
recrawlUrlUpdateHandler(self, dbWrapper, recrawlUrlUpdateProperty, urlUpdateObj) | dc_crawler.URLProcess.URLProcess | |
resetErrorMask(self, batchItem) | dc_crawler.URLProcess.URLProcess | |
resolveHTTP(self, postForms, headersDict) | dc_crawler.URLProcess.URLProcess | |
resolveTableName(self, localSiteId) | dc_crawler.URLProcess.URLProcess | |
setProtocols(self, protocols=None) | dc_crawler.URLProcess.URLProcess | |
simpleURLCanonize(self, realUrl) | dc_crawler.URLProcess.URLProcess | |
site | dc_crawler.URLProcess.URLProcess | |
siteId | dc_crawler.URLProcess.URLProcess | |
siteProperties | dc_crawler.URLProcess.URLProcess | |
updateAdditionProps(self, internalLinksCount, externalLinksCount, batchItem, size, freq, contentMd5) | dc_crawler.URLProcess.URLProcess | |
updateCollectTimeAndMime(self, detectedMime, batchItem, crawledTime, autoDetectMime, httpHeaders=None, strContent=None) | dc_crawler.URLProcess.URLProcess | |
updateCrawledURL(self, crawledResource, batchItem, contentSize, status=dc.EventObjects.URL.STATUS_CRAWLED) | dc_crawler.URLProcess.URLProcess | |
updateTypeForURLObjects(self, urlObjects, typeArg=dc.EventObjects.URL.TYPE_CHAIN) | dc_crawler.URLProcess.URLProcess | |
updateURL(self, batchItem, batchId, status=dc.EventObjects.URL.STATUS_CRAWLING) | dc_crawler.URLProcess.URLProcess | |
updateURLFields(self, urlMd5, wrapper, siteId) | dc_crawler.URLProcess.URLProcess | |
updateURLForFailed(self, errorBit, batchItem, httpCode=CONSTS.HTTP_CODE_400, status=dc.EventObjects.URL.STATUS_CRAWLED, updateUdate=True) | dc_crawler.URLProcess.URLProcess | |
updateURLStatus(self, urlId, status=dc.EventObjects.URL.STATUS_CRAWLED) | dc_crawler.URLProcess.URLProcess | |
url | dc_crawler.URLProcess.URLProcess | |
URL_TEMPLATE_CONST | dc_crawler.URLProcess.URLProcess | static |
urlDBSync(self, batchItem, crawlerType, recrawlPeriod, autoRemoveProps) | dc_crawler.URLProcess.URLProcess | |
urlObj | dc_crawler.URLProcess.URLProcess | |
urlTable | dc_crawler.URLProcess.URLProcess | |
urlTemplateApply(self, url, crawlerType, urlTempalteRegular, urlTempalteRealtime, urlTempalteRegularEncode, urlTempalteRealtimeEncode) | dc_crawler.URLProcess.URLProcess | |