HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_crawler.DetectModified.DetectModified Class Reference
Inheritance diagram for dc_crawler.DetectModified.DetectModified:
Collaboration diagram for dc_crawler.DetectModified.DetectModified:

Public Member Functions

def __init__ (self, modifiedSettingsStr)
 
def generateETagsString (self, eTags)
 
def headersClearing (self, httpParams)
 
def makeHTTPRequest (self, fetchType, httpParams)
 
def resourceComparing (self, res)
 
def getBehaviour (self)
 
def isNotModified (self)
 
def notModifiedStateProcessing (self, siteId, url, dbWrapper, defaultStatus=dc_event.URL.STATUS_CRAWLED, defaultUpdateUDate=True)
 

Public Attributes

 modifiedSettings
 
 lastModified
 
 eTags
 
 isResourceNotChanged
 
 prevContentLen
 
 prevContentMd5
 
 prevContentDate
 

Static Public Attributes

int MODE_DISABLED = 0
 
int MODE_ONE_REQUEST = 1
 
int MODE_TWO_REQUESTS = 2
 
int ALGO_AUTO = 0
 
int ALGO_IF_NOT_MATCH = 1
 
int ALGO_IF_MOFIFIED_SINCE = 2
 
int ALGO_HEAD = 3
 
int COMPARE_AUTO = 0
 
int COMPARE_DATE = 1
 
int COMPARE_CRC32 = 2
 
int COMPARE_RAW_CONTENT = 3
 
int BEHAVIOR_DEFAULT = 0
 
int BEHAVIOR_CRAWLED_STATUS = 1
 
int BEHAVIOR_PROCESSED_STATUS = 2
 
int BEHAVIOR_SAVE_UDATE = 3
 

Detailed Description

Definition at line 36 of file DetectModified.py.

Constructor & Destructor Documentation

◆ __init__()

def dc_crawler.DetectModified.DetectModified.__init__ (   self,
  modifiedSettingsStr 
)

Definition at line 61 of file DetectModified.py.

def __init__(self, modifiedSettingsStr):
    """Create the detector from a JSON-encoded settings string.

    modifiedSettingsStr -- JSON text with the detection settings. When it
    cannot be decoded, `modifiedSettings` is left as None, which the other
    methods of this class check before using the settings.
    """
    # State describing the previously stored copy of the resource.
    self.lastModified = None
    self.eTags = None
    self.prevContentLen = None
    self.prevContentMd5 = None
    self.prevContentDate = None
    # Result of the most recent comparison.
    self.isResourceNotChanged = False

    # Best-effort parse: any decoding failure leaves the settings unset.
    try:
        parsedSettings = json.loads(modifiedSettingsStr)
    except Exception:
        parsedSettings = None
    self.modifiedSettings = parsedSettings
72 
73 
def __init__(self)
constructor
Definition: UIDGenerator.py:19

Member Function Documentation

◆ generateETagsString()

def dc_crawler.DetectModified.DetectModified.generateETagsString (   self,
  eTags 
)

Definition at line 78 of file DetectModified.py.

def generateETagsString(self, eTags):
    """Build the quoted value used for the "if-none-match" request header.

    eTags -- None, a single string, or a list of entity tags.

    Returns "" when eTags is None. A list is joined as "a","b",...; a single
    string is wrapped in double quotes. Any other type yields an empty quoted
    string ('""'), as does an empty list.
    """
    ret = ""
    if eTags is not None:
        # Resolve the string base type without assuming the interpreter
        # version: `basestring` exists only on Python 2.
        try:
            stringTypes = basestring
        except NameError:
            stringTypes = str

        if isinstance(eTags, list):
            # str.join() rejects non-string items, so convert those (and only
            # those, to leave unicode tags untouched) before joining.
            ret = '","'.join(tag if isinstance(tag, stringTypes) else str(tag) for tag in eTags)
        elif isinstance(eTags, stringTypes):
            ret = eTags
        ret = '"' + ret + '"'
    return ret
92 
93 
Definition: join.py:1
Here is the caller graph for this function:

◆ getBehaviour()

def dc_crawler.DetectModified.DetectModified.getBehaviour (   self)

Definition at line 200 of file DetectModified.py.

def getBehaviour(self):
    """Return the configured "behavior" setting, or None when no settings are loaded."""
    settings = self.modifiedSettings
    if settings is None:
        return None
    return settings["behavior"]
205 
206 
Here is the caller graph for this function:

◆ headersClearing()

def dc_crawler.DetectModified.DetectModified.headersClearing (   self,
  httpParams 
)

Definition at line 94 of file DetectModified.py.

def headersClearing(self, httpParams):
    """Remove conditional-request headers from httpParams["httpHeader"] in place.

    Header names are matched case-insensitively against "if-none-match" and
    "if-modified-since"; all other headers are left untouched.
    """
    headers = httpParams["httpHeader"]
    conditionalNames = ("if-none-match", "if-modified-since")
    # Collect the matching keys first: the dict must not shrink while it is
    # being iterated.
    staleKeys = [name for name in headers if name.lower() in conditionalNames]
    for name in staleKeys:
        del headers[name]
101 
102 
Here is the caller graph for this function:

◆ isNotModified()

def dc_crawler.DetectModified.DetectModified.isNotModified (   self)

Definition at line 210 of file DetectModified.py.

def isNotModified(self):
    """Tell whether the last comparison found the resource unchanged.

    Returns True only when `isResourceNotChanged` is set, settings are loaded
    and the configured "behavior" is one of the BEHAVIOR_* values of this
    class; otherwise returns False.
    """
    logger.debug("!!! isNotModified() enter ... self.isResourceNotChanged = " + str(bool(self.isResourceNotChanged)))

    ret = False
    if self.isResourceNotChanged and self.modifiedSettings is not None:
        knownBehaviors = (self.BEHAVIOR_DEFAULT,
                          self.BEHAVIOR_CRAWLED_STATUS,
                          self.BEHAVIOR_PROCESSED_STATUS,
                          self.BEHAVIOR_SAVE_UDATE)
        ret = self.modifiedSettings["behavior"] in knownBehaviors

    logger.debug("!!! isNotModified() leave ... ret = " + str(bool(ret)))
    return ret
224 
225 
Here is the call graph for this function:

◆ makeHTTPRequest()

def dc_crawler.DetectModified.DetectModified.makeHTTPRequest (   self,
  fetchType,
  httpParams 
)

Definition at line 103 of file DetectModified.py.

def makeHTTPRequest(self, fetchType, httpParams):
    """Fetch the resource using the configured modification-detection algorithm.

    fetchType  -- fetcher type passed to BaseFetcher.get_fetcher().
    httpParams -- dict of request parameters. "httpHeader" is created when
                  absent; "url", "httpTimeout", "allowHttpRedirects",
                  "proxies", "auth", "postData", "processContentTypes",
                  "maxResourceSize" and "maxHttpRedirects" are required and
                  raise KeyError when missing. The header dict is modified
                  in place.

    Returns the fetcher response, or None when no settings are loaded.
    Raises Exception when ALGO_HEAD is combined with MODE_ONE_REQUEST.

    After the first response is compared, a second unconditional request is
    made when the resource changed and the mode is MODE_TWO_REQUESTS.
    """
    ret = None
    self.isResourceNotChanged = False
    if self.modifiedSettings is not None:
        if httpParams is None:
            httpParams = {}
        # Without this the header clean-up below fails with KeyError for
        # parameter sets that carry no headers yet.
        httpParams.setdefault("httpHeader", {})

        # Drop conditional headers left over from the caller or a previous run.
        self.headersClearing(httpParams)

        algorithm = self.modifiedSettings["algorithm"]
        localMethod = "get"
        if algorithm == self.ALGO_HEAD:
            if self.modifiedSettings["mode"] == self.MODE_ONE_REQUEST:
                raise Exception(">>> Error [algorithm == 3 and mode == 1] not compatible !!!")
            localMethod = "head"
        else:
            # Compare by value: an identity test against "" lets an empty
            # unicode string through and sends a meaningless header.
            if algorithm in (self.ALGO_AUTO, self.ALGO_IF_NOT_MATCH) and \
               self.eTags is not None and self.eTags != "":
                httpParams["httpHeader"]["if-none-match"] = self.generateETagsString(self.eTags)

            if algorithm in (self.ALGO_AUTO, self.ALGO_IF_MOFIFIED_SINCE) and \
               self.lastModified is not None:
                httpParams["httpHeader"]["if-modified-since"] = self.lastModified

        ret = self._openResource(fetchType, httpParams, method=localMethod)

        if ret is not None:
            self.resourceComparing(ret)
            if not self.isResourceNotChanged and self.modifiedSettings["mode"] == self.MODE_TWO_REQUESTS:
                # Second pass: plain request without conditional headers and
                # with the fetcher's default method.
                self.headersClearing(httpParams)
                ret = self._openResource(fetchType, httpParams)

    return ret


def _openResource(self, fetchType, httpParams, **extraArgs):
    """Open httpParams["url"] with the fetcher for fetchType; extraArgs are passed through to open()."""
    return BaseFetcher.get_fetcher(fetchType).open(httpParams["url"],
                                                   timeout=httpParams["httpTimeout"],
                                                   headers=httpParams["httpHeader"],
                                                   allow_redirects=httpParams["allowHttpRedirects"],
                                                   proxies=httpParams["proxies"], auth=httpParams["auth"],
                                                   data=httpParams["postData"], log=logger,
                                                   allowed_content_types=httpParams["processContentTypes"],
                                                   max_resource_size=httpParams["maxResourceSize"],
                                                   max_redirects=httpParams["maxHttpRedirects"],
                                                   **extraArgs)
167 
168 
Here is the call graph for this function:

◆ notModifiedStateProcessing()

def dc_crawler.DetectModified.DetectModified.notModifiedStateProcessing (   self,
  siteId,
  url,
  dbWrapper,
  defaultStatus = dc_event.URL.STATUS_CRAWLED,
  defaultUpdateUDate = True 
)

Definition at line 235 of file DetectModified.py.

235  defaultUpdateUDate=True):
236  # variables for result
237  status = defaultStatus
238  updateUDate = defaultUpdateUDate
239  if self.isResourceNotChanged:
240  status = dc_event.URL.STATUS_UNDEFINED
241  updateUDate = False
242  if self.getBehaviour() == self.BEHAVIOR_CRAWLED_STATUS:
243  status = dc_event.URL.STATUS_CRAWLED
244  elif self.getBehaviour() == self.BEHAVIOR_PROCESSED_STATUS:
245  urlContentObj = dc_event.URLContentRequest(siteId, url, dc_event.URLContentRequest.CONTENT_TYPE_PROCESSED)
246  urlContentResponse = dbWrapper.urlContent([urlContentObj])
247  if len(urlContentResponse.processedContents) > 0:
248  status = dc_event.URL.STATUS_PROCESSED
249  elif self.getBehaviour() == self.BEHAVIOR_SAVE_UDATE:
250  updateUDate = True
251 
252  return status, updateUDate
253 
Here is the call graph for this function:
Here is the caller graph for this function:

◆ resourceComparing()

def dc_crawler.DetectModified.DetectModified.resourceComparing (   self,
  res 
)

Definition at line 169 of file DetectModified.py.

def resourceComparing(self, res):
    """Compare a fetched response with the stored state and set `isResourceNotChanged`.

    res -- response object; depending on the compare mode its `status_code`,
           `headers` or `rendered_unicode_content` is read.

    The flag is reset to False first and set to True when, for the configured
    "compare" mode:
      COMPARE_AUTO        -- the status code is 304;
      COMPARE_DATE        -- the "last-modified" header is not later than
                             `prevContentDate`;
      COMPARE_CRC32       -- the MD5 hex digest of the content equals
                             `prevContentMd5`;
      COMPARE_RAW_CONTENT -- the content length equals `prevContentLen`.

    Raises Exception when either date cannot be parsed in COMPARE_DATE mode.
    """
    self.isResourceNotChanged = False
    compareMode = self.modifiedSettings["compare"]
    unchanged = False

    if compareMode == self.COMPARE_AUTO:
        if res.status_code == 304:
            unchanged = True
    elif compareMode == self.COMPARE_DATE:
        if "last-modified" in res.headers and self.prevContentDate is not None:
            try:
                currentStamp = time.mktime(rfc822.parsedate(res.headers["last-modified"]))
                previousStamp = time.mktime(rfc822.parsedate(self.prevContentDate))
                if currentStamp <= previousStamp:
                    unchanged = True
            except Exception:
                raise Exception(">>> Bad data format - resDate -" + str(res.headers["last-modified"]) + " or prevResDate -" +
                                str(self.prevContentDate))
    elif compareMode == self.COMPARE_CRC32:
        content = res.rendered_unicode_content
        if content is not None and hashlib.md5(content).hexdigest() == self.prevContentMd5:
            unchanged = True
    elif compareMode == self.COMPARE_RAW_CONTENT:
        content = res.rendered_unicode_content
        if content is not None and len(content) == self.prevContentLen:
            unchanged = True

    if unchanged:
        self.isResourceNotChanged = True
191 
192 
193 # def raiseExceptionIfNotModified(self):
194 # if self.isResourceNotChanged and self.modifiedSettings is not None and \
195 # self.modifiedSettings["behavior"] in [self.BEHAVIOR_CRAWLED_STATUS, self.BEHAVIOR_PROCESSED_STATUS, \
196 # self.BEHAVIOR_SAVE_UDATE]:
197 # raise NotModifiedException("Detect resource not modified state")
198 
199 
Here is the caller graph for this function:

Member Data Documentation

◆ ALGO_AUTO

int dc_crawler.DetectModified.DetectModified.ALGO_AUTO = 0
static

Definition at line 42 of file DetectModified.py.

◆ ALGO_HEAD

int dc_crawler.DetectModified.DetectModified.ALGO_HEAD = 3
static

Definition at line 45 of file DetectModified.py.

◆ ALGO_IF_MOFIFIED_SINCE

int dc_crawler.DetectModified.DetectModified.ALGO_IF_MOFIFIED_SINCE = 2
static

Definition at line 44 of file DetectModified.py.

◆ ALGO_IF_NOT_MATCH

int dc_crawler.DetectModified.DetectModified.ALGO_IF_NOT_MATCH = 1
static

Definition at line 43 of file DetectModified.py.

◆ BEHAVIOR_CRAWLED_STATUS

int dc_crawler.DetectModified.DetectModified.BEHAVIOR_CRAWLED_STATUS = 1
static

Definition at line 53 of file DetectModified.py.

◆ BEHAVIOR_DEFAULT

int dc_crawler.DetectModified.DetectModified.BEHAVIOR_DEFAULT = 0
static

Definition at line 52 of file DetectModified.py.

◆ BEHAVIOR_PROCESSED_STATUS

int dc_crawler.DetectModified.DetectModified.BEHAVIOR_PROCESSED_STATUS = 2
static

Definition at line 54 of file DetectModified.py.

◆ BEHAVIOR_SAVE_UDATE

int dc_crawler.DetectModified.DetectModified.BEHAVIOR_SAVE_UDATE = 3
static

Definition at line 55 of file DetectModified.py.

◆ COMPARE_AUTO

int dc_crawler.DetectModified.DetectModified.COMPARE_AUTO = 0
static

Definition at line 47 of file DetectModified.py.

◆ COMPARE_CRC32

int dc_crawler.DetectModified.DetectModified.COMPARE_CRC32 = 2
static

Definition at line 49 of file DetectModified.py.

◆ COMPARE_DATE

int dc_crawler.DetectModified.DetectModified.COMPARE_DATE = 1
static

Definition at line 48 of file DetectModified.py.

◆ COMPARE_RAW_CONTENT

int dc_crawler.DetectModified.DetectModified.COMPARE_RAW_CONTENT = 3
static

Definition at line 50 of file DetectModified.py.

◆ eTags

dc_crawler.DetectModified.DetectModified.eTags

Definition at line 67 of file DetectModified.py.

◆ isResourceNotChanged

dc_crawler.DetectModified.DetectModified.isResourceNotChanged

Definition at line 68 of file DetectModified.py.

◆ lastModified

dc_crawler.DetectModified.DetectModified.lastModified

Definition at line 66 of file DetectModified.py.

◆ MODE_DISABLED

int dc_crawler.DetectModified.DetectModified.MODE_DISABLED = 0
static

Definition at line 38 of file DetectModified.py.

◆ MODE_ONE_REQUEST

int dc_crawler.DetectModified.DetectModified.MODE_ONE_REQUEST = 1
static

Definition at line 39 of file DetectModified.py.

◆ MODE_TWO_REQUESTS

int dc_crawler.DetectModified.DetectModified.MODE_TWO_REQUESTS = 2
static

Definition at line 40 of file DetectModified.py.

◆ modifiedSettings

dc_crawler.DetectModified.DetectModified.modifiedSettings

Definition at line 63 of file DetectModified.py.

◆ prevContentDate

dc_crawler.DetectModified.DetectModified.prevContentDate

Definition at line 71 of file DetectModified.py.

◆ prevContentLen

dc_crawler.DetectModified.DetectModified.prevContentLen

Definition at line 69 of file DetectModified.py.

◆ prevContentMd5

dc_crawler.DetectModified.DetectModified.prevContentMd5

Definition at line 70 of file DetectModified.py.


The documentation for this class was generated from the following file: