2 HCE project, Python bindings, Distributed Crawler application. 3 Event objects definitions. 6 @author bgv bgv.hce@gmail.com 7 @link: http://hierarchical-cluster-engine.com/ 8 @copyright: Copyright © 2013-2014 IOIX Ukraine 9 @license: http://hierarchical-cluster-engine.com/license/ 25 NOT_DELAYED_OPERATION = 1
40 STATE_CLEANUP_TASK = 8
44 FETCH_TYPE_DYNAMIC = 2
46 FETCH_TYPE_EXTERNAL = 3
49 DEFAULT_PRIORITY = 100
52 DEFAULT_CATEGORY_ID = 0
61 url =
URL(siteId=0, url=url,
62 normalizeMask=UrlNormalizator.NORM_NONE).getURL(normalizeMask=UrlNormalizator.NORM_NONE)
65 self.
id = hashlib.md5(url).hexdigest()
113 if url
is not None and len(url) > 0:
114 localUrl =
SiteURL(siteId=self.
id, url=url, normalizeMask=UrlNormalizator.NORM_NONE)
115 self.
urls.append(localUrl)
121 self.
properties = [{
"name":
"PROCESS_CTYPES",
"value":
"text/html"},
122 {
"name":
"STORE_HTTP_REQUEST",
"value":
"1"},
123 {
"name":
"STORE_HTTP_HEADERS",
"value":
"1"},
124 {
"name":
"HTTP_HEADERS",
"value":
""},
125 {
"name":
"HTTP_COOKIE",
"value":
""}]
152 excludeFields = [
"urls",
"filters",
"properties"]
153 for field
in siteObj.__dict__:
154 if field
not in excludeFields
and siteObj.__dict__[field]
is not None:
155 self.__dict__[field] = siteObj.__dict__[field]
156 for field
in excludeFields:
158 if self.__dict__[field]
is not None and siteObj.__dict__[field]
is not None:
159 self.__dict__[field] += siteObj.__dict__[field]
160 elif siteObj.__dict__[field]
is not None:
161 self.__dict__[field] = []
162 self.__dict__[field] += siteObj.__dict__[field]
163 elif siteObj.__dict__[field]
is not None:
164 self.__dict__[field] = siteObj.__dict__[field]
175 if isinstance(prop, dict)
and keyName
in prop:
178 if isinstance(prop, list):
180 if isinstance(item, dict)
and keyName == item[
"name"]:
196 if isinstance(prop, dict)
and keyName
in prop:
199 if isinstance(prop, list):
201 if isinstance(item, dict)
and keyName == item[
"name"]
and fieldName
in item:
202 ret = item[fieldName]
215 UPDATE_TYPE_APPEND = 0
216 UPDATE_TYPE_OVERWRITE = 1
217 UPDATE_TYPE_UPDATE = 2
223 def __init__(self, siteId, updateType=UPDATE_TYPE_APPEND):
224 super(SiteUpdate, self).
__init__(
"")
272 MAX_NUMBER_DEFAULT = 10
274 CRITERION_LIMIT =
"LIMIT" 275 CRITERION_WHERE =
"WHERE" 276 CRITERION_ORDER =
"ORDER BY" 277 CRITERION_TABLES =
"TABLES" 278 DEFAULT_ORDER_BY_CDATE =
"CDate DESC" 294 if criterions
is None:
349 def __init__(self, siteId=None, taskType=TASK_TYPE_SYNC, criterions=None):
354 if criterions
is not None:
358 if self.
id is not None and URLFetch.CRITERION_WHERE
not in self.
criterions:
359 self.
criterions[URLFetch.CRITERION_WHERE] =
"`Site_Id=`" + str(self.
id)
372 HISTORY_CLEANUP_NOT = 0
373 HISTORY_CLEANUP_LOG = 1
374 HISTORY_CLEANUP_FULL = 2
381 def __init__(self, siteId, taskType=TASK_TYPE_SYNC):
413 def __init__(self, siteId, pattern, ptype=TYPE_INCLUDE, pmode=TYPE_URL, pstate=TYPE_ENABLED):
444 STATUS_SELECTED_CRAWLING = 2
447 STATUS_SELECTED_PROCESSING = 5
448 STATUS_PROCESSING = 6
450 STATUS_SELECTED_CRAWLING_INCREMENTAL = 8
454 CONTENT_STORED_ON_DISK = 1 << 0
467 TYPE_REAL_TIME_CRAWLER = 5
472 SITE_SELECT_TYPE_EXPLICIT = 0
475 SITE_SELECT_TYPE_AUTO = 1
478 SITE_SELECT_TYPE_QUALIFY_URL = 2
479 SITE_SELECT_TYPE_NONE = 3
481 CONTENT_TYPE_TEXT_HTML =
"text/html" 482 CONTENT_TYPE_UNDEFINED =
"" 484 URL_NORMALIZE_MASK = UrlNormalizator.NORM_DEFAULT
490 def __init__(self, siteId, url, state=STATE_ENABLED, urlUpdate=None, normalizeMask=URL_NORMALIZE_MASK):
552 def getURL(self, normalizeMask=URL_NORMALIZE_MASK):
554 if normalizeMask != UrlNormalizator.NORM_NONE:
555 url = UrlNormalizator.normalize(self.
url,
None, normalizeMask)
567 def __init__(self, siteId, url, stateField=None, normalizeMask=URL.URL_NORMALIZE_MASK):
568 super(SiteURL, self).
__init__(siteId, url, stateField, normalizeMask=normalizeMask)
602 DEFAULT_ALGORITHM = 0
603 PROPORTIONAL_ALGORITHM = 1
606 DEFAULT_ORDER_BY_SITES =
"Priority DESC, TcDate ASC" 607 DEFAULT_ORDER_BY_URLS =
"CDate ASC" 609 CRITERION_LIMIT =
"LIMIT" 610 CRITERION_WHERE =
"WHERE" 611 CRITERION_ORDER =
"ORDER BY" 612 CRITERION_SQL =
"SQL" 623 def __init__(self, sitesList=None, urlsCriterions=None, sitesCriterions=None, urlUpdate=None, siteUpdate=None):
626 if sitesList
is None:
630 if sitesCriterions
is None:
636 if urlsCriterions
is None:
667 def __init__(self, siteId, urlString, urlType=URLStatus.URL_TYPE_URL, stateField=None, statusField=None,
668 normalizeMask=URL.URL_NORMALIZE_MASK, urlObject=None):
669 if urlObject
is None or not isinstance(urlObject, URL):
671 if urlType == URLStatus.URL_TYPE_URL:
676 super(URLUpdate, self).
__init__(siteId=siteId, url=url, state=stateField, normalizeMask=normalizeMask)
684 self.
fillMD5(urlString, urlType)
724 for name, value
in urlObject.__dict__.items():
725 if not name.startswith(
"__"):
726 if hasattr(self, name)
and value
is not None:
727 setattr(self, name, value)
735 if urlType == URLStatus.URL_TYPE_URL:
760 def __init__(self, siteId, urlId, urlObj, urlPutObj=None, urlContentResponse=None, siteObj=None, depth=0):
788 OPERATION_TYPE_NAME =
"type" 789 TYPE_NORMAL_CRAWLER = 1
790 TYPE_INCR_CRAWLER = 2
792 TYPE_REAL_TIME_CRAWLER = 4
807 def __init__(self, batchId, batchItems=None, crawlerType=None, dbMode=DB_MODE_RW, maxIterations=1, maxItems=None):
810 if crawlerType
is None:
811 crawlerType = Batch.TYPE_NORMAL_CRAWLER
813 if batchItems
is None:
816 self.
items = batchItems
832 REASON_USER_REQUEST = 0
834 REASON_SITE_LIMITS = 2
835 REASON_SELECT_TO_CRAWL_TTL = 3
836 REASON_SELECT_TO_PROCESS_TTL = 4
838 REASON_CRAWLER_AUTOREMOVE = 6
839 REASON_SITE_UPDATE_ROOT_URLS = 7
840 REASON_RT_FINALIZER = 10
841 REASON_PROCESSOR_DUPLICATE = 11
849 def __init__(self, siteId, urlString, urlType=URLStatus.URL_TYPE_URL, criterions=None, reason=REASON_USER_REQUEST):
873 def __init__(self, siteId, urlString, urlType=URLStatus.URL_TYPE_URL, stateField=None, statusField=None,
892 CONTENT_TYPE_PROCESSED = 1
893 CONTENT_TYPE_RAW_LAST = 2
894 CONTENT_TYPE_RAW_FIRST = 4
895 CONTENT_TYPE_RAW_ALL = 8
896 CONTENT_TYPE_HEADERS = 16
897 CONTENT_TYPE_REQUESTS = 32
898 CONTENT_TYPE_META = 64
899 CONTENT_TYPE_COOKIES = 128
900 CONTENT_TYPE_TIDY = 256
901 CONTENT_TYPE_DYNAMIC = 512
902 CONTENT_TYPE_RAW = 1024
903 CONTENT_TYPE_CHAIN = 2048
904 CONTENT_TYPE_PROCESSED_INTERNAL = 4096
905 CONTENT_TYPE_PROCESSED_CUSTOM = 8192
906 CONTENT_TYPE_PROCESSED_ALL = 16384
907 CONTENT_TYPE_ATTRIBUTES = 32768
918 def __init__(self, siteId, urlString, contentTypeMask=CONTENT_TYPE_PROCESSED + CONTENT_TYPE_RAW_LAST,
919 urlType=URL_TYPE_STRING):
920 super(URLContentRequest, self).
__init__()
930 self.
dbFieldsList = [
"Status",
"Crawled",
"Processed",
"ContentType",
"Charset",
"ErrorMask",
"CrawlingTime",
931 "ProcessingTime",
"HTTPCode",
"Size",
"LinksI",
"LinksE",
"RawContentMd5",
"LastModified",
932 "CDate",
"UDate",
"TagsMask",
"TagsCount",
"PDate",
"ContentURLMd5",
"Batch_Id"]
948 "CDate":int(time.time()),
961 return hashlib.md5(urlString).hexdigest()
971 CONTENT_RAW_CONTENT = 0
972 CONTENT_TIDY_CONTENT = 1
973 CONTENT_HEADERS_CONTENT = 2
974 CONTENT_REQUESTS_CONTENT = 3
975 CONTENT_META_CONTENT = 4
976 CONTENT_COOKIES_CONTENT = 5
977 CONTENT_DYNAMIC_CONTENT = 9
978 CONTENT_PROCESSOR_CONTENT = 10
979 CONTENT_CHAIN_PARTS = 11
986 def __init__(self, contentBuffer, cDate=0, typeId=CONTENT_RAW_CONTENT):
1004 STATUS_URL_NOT_FOUND = 1
1005 STATUS_RAW_CONTENT_NOT_FOUND = 2
1006 STATUS_PROCESSED_CONTENT_NOT_FOUND = 3
1021 def __init__(self, url, rawContents=None, processedContents=None, status=STATUS_OK):
1022 super(URLContentResponse, self).
__init__()
1029 if rawContents
is not None:
1033 if processedContents
is not None:
1058 STATUS_ERROR_NONE = 1
1059 STATUS_ERROR_EMPTY_LIST = 2
1067 def __init__(self, itemsList=None, errorCode=STATUS_OK, errorMessage=""):
1068 super(ClientResponse, self).
__init__()
1070 if itemsList
is None:
1082 STATUS_ERROR_RESTORE_OBJECT = 1
1083 STATUS_ERROR_DRCE = 2
1084 MSG_ERROR_RESTORE_OBJECT =
"Object restore error!" 1085 MSG_ERROR_RESTORE_OBJECT =
"DRCE error!" 1092 super(ClientResponseItem, self).
__init__()
1106 CRITERION_LIMIT =
"LIMIT" 1107 CRITERION_WHERE =
"WHERE" 1108 CRITERION_ORDER =
"ORDER BY" 1110 MAX_URLS_TO_DELETE_FROM_SITE = 100
1119 def __init__(self, siteId, urlString, urlType=URLStatus.URL_TYPE_URL, criterions=None):
1124 if criterions
is None:
1134 PARTITION_RECALC = 1
1137 def __init__(self, siteId, recalcType=FULL_RECALC, criterions=None):
1138 super(FieldRecalculatorObj, self).
__init__()
1148 def __init__(self, siteId, urlString, dbName, urlType=URLStatus.URL_TYPE_URL, criterions=None):
1164 CRITERION_LIMIT =
"LIMIT" 1165 CRITERION_WHERE =
"WHERE" 1166 CRITERION_ORDER =
"ORDER BY" 1168 MAX_URLS_TO_DELETE_FROM_SITE = 100
1169 MAX_SITES_TO_SELECT = 10
1175 def __init__(self, urlsCriterions=None, sitesCriterions=None):
1177 if urlsCriterions
is None:
1180 if sitesCriterions
is None:
1198 super(DataFetchRequest, self).
__init__()
1214 def __init__(self, resultDict, errCode=0, errMessage=""):
1215 super(DataFetchResponse, self).
__init__()
1232 super(DataDeleteRequest, self).
__init__()
1248 super(DataDeleteResponse, self).
__init__()
1264 super(DataCreateRequest, self).
__init__()
1280 super(DataCreateResponse, self).
__init__()
1296 def __init__(self, siteId, urlMd5, contentType, putDict=None, criterions=None, fileStorageSuffix=None):
1317 def __init__(self, contentType, errCode=0, errMessage=""):
1318 super(URLPutResponse, self).
__init__()
1331 CRITERION_LIMIT =
"LIMIT" 1332 CRITERION_WHERE =
"WHERE" 1333 CRITERION_ORDER =
"ORDER BY" 1334 DEFAULT_ORDER =
"ODate ASC" 1335 DEFAULT_WHERE =
"URLMD5='%URL%'" 1344 def __init__(self, siteId, urlMd5=None, urlCriterions=None, logCriterions=None):
1345 super(URLHistoryRequest, self).
__init__()
1348 if urlCriterions
is None:
1354 if logCriterions
is None:
1374 super(URLHistoryResponse, self).
__init__()
1389 CRITERION_LIMIT =
"LIMIT" 1390 CRITERION_WHERE =
"WHERE" 1391 CRITERION_ORDER =
"ORDER BY" 1392 DEFAULT_ORDER =
"ODate ASC" 1393 DEFAULT_WHERE =
"URLMD5='%URL%'" 1402 def __init__(self, siteId, urlMd5=None, urlCriterions=None, statsCriterions=None):
1403 super(URLStatsRequest, self).
__init__()
1406 if urlCriterions
is None:
1412 if statsCriterions
is None:
1429 super(URLStatsResponse, self).
__init__()
1431 if freqRows
is None:
1477 super(ProxyUpdate, self).
__init__(siteId, host)
1509 def __init__(self, siteId=None, host=None, criterions=None):
1510 super(ProxyDelete, self).
__init__()
1513 if criterions
is not None:
1517 if self.
siteId is not None and URLFetch.CRITERION_WHERE
not in self.
criterions:
1518 self.
criterions[URLFetch.CRITERION_WHERE] =
"`Site_Id=`" + str(self.
siteId)
1527 def __init__(self, siteId=None, host=None, criterions=None):
1528 super(ProxyStatus, self).
__init__()
1531 if criterions
is not None:
1535 if self.
siteId is not None and URLFetch.CRITERION_WHERE
not in self.
criterions:
1536 self.
criterions[URLFetch.CRITERION_WHERE] =
"`Site_Id=`" + str(self.
siteId)
1546 def __init__(self, siteId=None, criterions=None, siteCriterions=None):
1549 if criterions
is not None:
1553 if siteCriterions
is not None:
1557 if self.
siteId is not None and URLFetch.CRITERION_WHERE
not in self.
criterions:
1558 self.
criterions[URLFetch.CRITERION_WHERE] =
"`Site_Id=`" + str(self.
siteId)
1573 def __init__(self, siteId, name, urlMd5='', value='', cDate=None):
1594 super(AttributeUpdate, self).
__init__(siteId, name)
1613 def __init__(self, siteId, name=None, criterions=None):
1614 super(AttributeDelete, self).
__init__()
1617 if criterions
is not None:
1635 def __init__(self, siteId, name=None, criterions=None):
1636 super(AttributeFetch, self).
__init__()
1639 if criterions
is not None:
def getFromProperties(prop, keyName, fieldName="value")
def __init__(self, siteId, urlMd5=None, urlCriterions=None, statsCriterions=None)
string CONTENT_TYPE_UNDEFINED
def __init__(self, url, criterions=None)
def __init__(self, siteId, urlMd5, filesSuffix)
int MAX_URLS_TO_DELETE_FROM_SITE
def __init__(self, siteId, urlMd5, contentType, putDict=None, criterions=None, fileStorageSuffix=None)
def __init__(self, contentBuffer, cDate=0, typeId=CONTENT_RAW_CONTENT)
def __init__(self, siteId, name, urlMd5='', value='', cDate=None)
def __init__(self, siteId, urlString, contentTypeMask=CONTENT_TYPE_PROCESSED+CONTENT_TYPE_RAW_LAST, urlType=URL_TYPE_STRING)
string DEFAULT_ORDER_BY_URLS
def __init__(self, batchId, batchItems=None, crawlerType=None, dbMode=DB_MODE_RW, maxIterations=1, maxItems=None)
def __init__(self, siteId, name=None, criterions=None)
def __init__(self, siteId=None, criterions=None, siteCriterions=None)
def __init__(self, url, _userId=0)
def __init__(self, errCode=0, errMessage="")
def __init__(self, urlsCriterions=None, sitesCriterions=None)
def __init__(self, resultDict, errCode=0, errMessage="")
def __init__(self, siteId, name=None, criterions=None)
def __init__(self, siteId, urlId, urlObj, urlPutObj=None, urlContentResponse=None, siteObj=None, depth=0)
def __init__(self, siteId, urlString, urlType=URLStatus.URL_TYPE_URL, stateField=None, statusField=None, criterions=None)
def __init__(self, siteId, updateType=UPDATE_TYPE_APPEND)
int SITE_SELECT_TYPE_NONE
def __init__(self, siteId=None, taskType=TASK_TYPE_SYNC, criterions=None)
def __init__(self, siteId, taskType=TASK_TYPE_SYNC)
def __init__(self, siteId, urlString, urlType=URLStatus.URL_TYPE_URL, criterions=None)
def __init__(self, siteId, urlMd5, filesSuffix)
def isInProperties(prop, keyName)
def fillMD5(self, urlString, urlType)
def __init__(self, siteId, url, state=STATE_ENABLED, urlUpdate=None, normalizeMask=URL_NORMALIZE_MASK)
def __init__(self, siteId, urlMd5, criterions=None)
def __init__(self, siteId, recalcType=FULL_RECALC, criterions=None)
def __init__(self, url, rawContents=None, processedContents=None, status=STATUS_OK)
def __init__(self, siteId, deleteTaskId=None)
def __init__(self, siteId, pattern, ptype=TYPE_INCLUDE, pmode=TYPE_URL, pstate=TYPE_ENABLED)
def __init__(self, siteId, host)
def __init__(self, itemObject)
def fillMD5(self, urlString)
def __init__(self, siteId, urlString)
dbFieldsListDefaultValues
def __init__(self, logRows=None, siteId=None)
string DEFAULT_ORDER_BY_SITES
def __init__(self, siteId, urlString, dbName, urlType=URLStatus.URL_TYPE_URL, criterions=None)
string DEFAULT_ORDER_BY_CDATE
def __init__(self, siteId=None, host=None, criterions=None)
def __init__(self, contentType, errCode=0, errMessage="")
int MAX_URLS_TO_DELETE_FROM_SITE
def __init__(self, sitesList=None, urlsCriterions=None, sitesCriterions=None, urlUpdate=None, siteUpdate=None)
def __init__(self, siteId=None, host=None, criterions=None)
def __init__(self, itemsList=None, errorCode=STATUS_OK, errorMessage="")
def __init__(self, siteId, host)
def __init__(self, freqRows=None, siteId=None)
def getURL(self, normalizeMask=URL_NORMALIZE_MASK)
def rewriteFields(self, siteObj, addListFields=True)
def __init__(self, siteId, url, stateField=None, normalizeMask=URL.URL_NORMALIZE_MASK)
def __init__(self, siteId, name)
def __init__(self, siteId, urlString, urlType=URLStatus.URL_TYPE_URL, criterions=None, reason=REASON_USER_REQUEST)
def __init__(self, errCode=0, errMessage="")
def __init__(self, siteId, urlString, urlType=URLStatus.URL_TYPE_URL, stateField=None, statusField=None, normalizeMask=URL.URL_NORMALIZE_MASK, urlObject=None)
def __init__(self, siteId, urlMd5=None, urlCriterions=None, logCriterions=None)