HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_crawler.Fetcher.SeleniumFetcher Class Reference

Public Member Functions

def __init__ (self, tmpDirOptions=None, log=None)
 
def __del__ (self)
 
def open (self, url, method='get', headers=None, timeout=DEFAUIL_TIMEOUT, allow_redirects=True, proxies=None, auth=None, data=None, log=None, allowed_content_types=None, max_resource_size=None, max_redirects=1, filters=None, executable_path=None, depth=None, macro=None)
 
def openT (self, url, headers, timeout, proxies, executable_path, macro)
 
def cleanup (self, state=0, headers=None)
 
def chromeProcessesCleanup (self, headers)
 
def killProcess (self, pid, dirsTemplate=CHROME_DIRS_TEMPLATE, dirDeleteBeforeTimeout=DELAY_TERMINATE_AND_QUIT)
 
def should_have_meta_res (self)
 
def getProcessDirs (self, process, dirsTemplate)
 
def getOptions (self, webdriver, headers, proxies, url)
 
def initializeTmpDirs (self, headers)
 
def removeTmpDirs (self, delay=DELAY_TERMINATE_AND_QUIT, tries=3)
 
def execMacroSimple (self, macro)
 
def getAllLogsAsString (self)
 
def execMacroExtended (self, macro)
 
- Public Member Functions inherited from dc_crawler.Fetcher.BaseFetcher
def __init__ (self)
 
def open (self, url, method='get', headers=None, timeout=100, allow_redirects=True, proxies=None, auth=None, data=None, log=None, allowed_content_types=None, max_resource_size=None, max_redirects=CONSTS.MAX_HTTP_REDIRECTS_LIMIT, filters=None, executable_path=None, depth=None, macro=None)
 
def should_have_meta_res (self)
 
def getDomainNameFromURL (self, url, default='')
 

Public Attributes

 logger
 
 tmpDirPath
 
 tmpDirPrefix
 
 tmpDirSuffix
 
 tmpDirType
 
 tmpDirRemoveBeforeCreate
 
 tmpDir
 
 driver
 
 driverPid
 
 inlineURLMacroDelimiter
 
 sessionId
 
 userDataDirUsed
 
- Public Attributes inherited from dc_crawler.Fetcher.BaseFetcher
 connectionTimeout
 
 logger
 

Static Public Attributes

int DEFAUIL_TIMEOUT = 5
 
string CONTENT_TYPE_JSON = 'text/json'
 
string CONTENT_TYPE_HTML = 'text/html'
 
float DELAY_TERMINATE_AND_QUIT = 0.5
 
int ERROR_FATAL = 1
 
int ERROR_GENERAL = 2
 
int ERROR_CONTENT_OR_COOKIE = 3
 
int ERROR_NAME_NOT_RESOLVED = 400
 
int ERROR_TOO_MANY_REDIRECTS = 11
 
int ERROR_MACRO_RETURN_VALUE = 12
 
int ERROR_PROXY_CONNECTION_FAILED = 504
 
int ERROR_CONNECTION_TIMED_OUT = 505
 
int ERROR_TUNNEL_CONNECTION_FAILED = 403
 
int ERROR_SERVICE_UNAVAILABLE = 503
 
int ERROR_CONFLICT = 409
 
int ERROR_EMPTY_RESPONSE = 13
 
string LOG_MESSAGE_RENDERRER_TIMEOUT = 'Timed out receiving message from renderer'
 
string LOG_MESSAGE_SERVER_RESPONSE_503 = 'server responded with a status of 503'
 
string LOG_MESSAGE_SERVER_RESPONSE_409 = 'server responded with a status of 409 (Conflict)'
 
list CHROME_PROCESS_NAMES = ['chrome', 'BrowserBlocking']
 
string CHROME_DIRS_TEMPLATE = '.google.Chrome.'
 
string CHROME_DEBUG_LOG_NAME = 'chrome_debug.log'
 
int MACRO_RESULT_TYPE_DEFAULT = 0
 
int MACRO_RESULT_TYPE_URLS_LIST = 1
 
int MACRO_RESULT_TYPE_CONTENT = 2
 
int MACRO_RESULT_TYPE_AUTO = 3
 
int TMP_DIR_TYPE_OPEN = 0
 
int TMP_DIR_TYPE_INSTANTIATE = 1
 
- Static Public Attributes inherited from dc_crawler.Fetcher.BaseFetcher
 fetchers = None
 
int TYP_NORMAL = 1
 
int TYP_DYNAMIC = 2
 
int TYP_URLLIB = 5
 
int TYP_CONTENT = 6
 
int TYP_AUTO = 7
 
float CONNECTION_TIMEOUT = 1.0
 

Additional Inherited Members

- Static Public Member Functions inherited from dc_crawler.Fetcher.BaseFetcher
def init (dbWrapper=None, siteId=None)
 
def get_fetcher (typ, dbWrapper=None, siteId=None)
 

Detailed Description

Definition at line 397 of file Fetcher.py.
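SeleniumFetcher is the dynamic-content fetcher of the crawler: it launches Google Chrome through Selenium WebDriver (chromedriver), loads the requested URL, optionally executes JavaScript macros in the page, and returns a Response object carrying the rendered page source, cookies, detected content type and charset, and fetch timing. Per-instance temporary Chrome profile directories and leftover Chrome processes are cleaned up after each fetch. A minimal usage sketch follows (Python 2, matching the source), assuming the dc_crawler package, a chromedriver binary and a running X display are available; the URL, driver path and header values are illustrative.

  from dc_crawler.Fetcher import SeleniumFetcher

  fetcher = SeleniumFetcher(tmpDirOptions={'path': '/tmp', 'prefix': 'dfetcher_tmp_%PID%'})
  res = fetcher.open('http://example.com/',
                     headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'},
                     timeout=30.5,  # 30 s execution limit, 5 s wait after page load
                     executable_path='/usr/bin/chromedriver')
  print res.status_code, res.encoding, len(res.unicode_content)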

Constructor & Destructor Documentation

◆ __init__()

def dc_crawler.Fetcher.SeleniumFetcher.__init__ (   self,
  tmpDirOptions = None,
  log = None 
)

Definition at line 437 of file Fetcher.py.

437  def __init__(self, tmpDirOptions=None, log=None):
438  super(SeleniumFetcher, self).__init__()
439 
440  if log is not None:
441  self.logger = log
442 
443  if self.logger:
444  self.logger.debug("Initialization of instance, tmpDirOptions: %s", str(tmpDirOptions))
445 
446  self.tmpDirPath = '/tmp'
447  self.tmpDirPrefix = 'dfetcher_tmp_%PID%'
448  self.tmpDirSuffix = ''
449  self.tmpDirType = self.TMP_DIR_TYPE_OPEN
450  self.tmpDirRemoveBeforeCreate = True
451  if tmpDirOptions is not None:
452  if 'path' in tmpDirOptions:
453  self.tmpDirPath = tmpDirOptions['path']
454  if 'prefix' in tmpDirOptions:
455  self.tmpDirPrefix = tmpDirOptions['prefix']
456  if 'suffix' in tmpDirOptions:
457  self.tmpDirSuffix = tmpDirOptions['suffix']
458  if 'type' in tmpDirOptions:
459  self.tmpDirType = int(tmpDirOptions['type'])
460  if 'remove_before_create' in tmpDirOptions:
461  self.tmpDirRemoveBeforeCreate = bool(int(tmpDirOptions['remove_before_create']))
462  pid = str(os.getpid()).strip()
463 
464  if self.tmpDirPath == '' and self.tmpDirPrefix == '' and self.tmpDirSuffix == '':
465  self.tmpDir = ''
466  else:
467  self.tmpDir = self.tmpDirPath + '/' + self.tmpDirPrefix.replace('%PID%', pid) + \
468  self.tmpDirSuffix.replace('%PID%', pid)
469  if self.tmpDirType == self.TMP_DIR_TYPE_INSTANTIATE:
470  if not self.initializeTmpDirs(None):
471  msg = 'Temporary directory type INSTANTIATE `%s` initialization error!', self.tmpDir
472  if self.logger is not None:
473  self.logger.error(msg)
474  raise SeleniumFetcherException(msg)
475  else:
476  if self.logger is not None:
477  self.logger.debug("Temporary directory type INSTANTIATE `%s` initialized!", self.tmpDir)
478  self.driver = None
479  self.driverPid = 0
480  self.inlineURLMacroDelimiter = '###'
481  self.sessionId = '--sessionId=' + str(pid)
482  self.userDataDirUsed = ''
483 
484 
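The constructor recognizes the tmpDirOptions keys shown in the listing above: 'path', 'prefix', 'suffix', 'type' and 'remove_before_create'. The 'type' value selects TMP_DIR_TYPE_OPEN (0, directory prepared on each open() call) or TMP_DIR_TYPE_INSTANTIATE (1, directory created here and removed in __del__), and any '%PID%' in the prefix or suffix is replaced with the current process id. A hedged sketch of the dictionary; the directory values are examples.

  tmpDirOptions = {
      'path': '/tmp',                   # base directory for the per-process profile dir
      'prefix': 'dfetcher_tmp_%PID%',   # '%PID%' is replaced with os.getpid()
      'suffix': '',
      'type': 1,                        # TMP_DIR_TYPE_INSTANTIATE: create the dir in the constructor
      'remove_before_create': 1,        # truthy int: remove an existing dir before creating it
  }
  fetcher = SeleniumFetcher(tmpDirOptions=tmpDirOptions, log=None)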

◆ __del__()

def dc_crawler.Fetcher.SeleniumFetcher.__del__ (   self)

Definition at line 487 of file Fetcher.py.

487  def __del__(self):
488  if self.logger:
489  self.logger.debug("Delete instance, temporary dir type: %s", str(self.tmpDirType))
490 
491  if self.tmpDirType == self.TMP_DIR_TYPE_INSTANTIATE:
492  self.removeTmpDirs()
493 
494 

Member Function Documentation

◆ chromeProcessesCleanup()

def dc_crawler.Fetcher.SeleniumFetcher.chromeProcessesCleanup (   self,
  headers 
)

Definition at line 960 of file Fetcher.py.

960  def chromeProcessesCleanup(self, headers):
961  if self.logger:
962  self.logger.debug("Chrome processes cleanup started")
963 
964  if self.sessionId != '':
965  key = self.sessionId
966  else:
967  if self.tmpDir == '':
968  if '--disk-cache-dir' in headers:
969  key = '--disk-cache-dir=' + headers['--disk-cache-dir']
970  if '--profile-directory' in headers:
971  key = '--profile-directory=' + headers['--profile-directory']
972  if '--user-data-dir' in headers:
973  key = '--user-data-dir=' + headers['--user-data-dir']
974  else:
975  key = self.tmpDir
976 
977  try:
978  for proc in psutil.process_iter():
979  try:
980  # if self.logger:
981  # self.logger.debug("Candidate, pid:%s, name: %s cmdline: %s", str(proc.pid), str(proc.name()),
982  # str(proc.cmdline()))
983  for name in self.CHROME_PROCESS_NAMES:
984  if name in proc.name():
985  found = False
986  for item in proc.cmdline():
987  if key in item:
988  found = True
989  break
990  if found:
991  if self.logger:
992  self.logger.debug("Chrome process killing, pid:%s, cmdline: %s", str(proc.pid), str(proc.cmdline()))
993  self.killProcess(proc.pid, self.CHROME_DIRS_TEMPLATE, self.DELAY_TERMINATE_AND_QUIT)
994  except Exception, err:
995  if self.logger:
996  self.logger.debug("Chrome process kill error: %s", str(err))
997  except Exception, err:
998  if self.logger:
999  self.logger.debug("Chrome process kill error: %s", str(err))
1000 
1001 

◆ cleanup()

def dc_crawler.Fetcher.SeleniumFetcher.cleanup (   self,
  state = 0,
  headers = None 
)

Definition at line 915 of file Fetcher.py.

915  def cleanup(self, state=0, headers=None):
916  if self.logger is not None and '--log-chrome-debug-log' in headers:
917  logFile = self.userDataDirUsed + '/' + self.CHROME_DEBUG_LOG_NAME
918  try:
919  with open(logFile, 'r') as f:
920  logData = f.read()
921  self.logger.debug("Chrome debug log file `%s`:\n%s", logFile, logData)
922  except Exception, err:
923  self.logger.debug("Error read chrome debug log file `%s`: %s", logFile, str(err))
924 
925  if self.logger is not None:
926  self.logger.debug("Cleanup type: %s, driver: %s", str(state), str(self.driver))
927 
928  try:
929  if self.driver is not None:
930  self.driver.quit()
931  time.sleep(self.DELAY_TERMINATE_AND_QUIT)
932  except Exception:
933  pass
934 
935  if state == 1:
936  time.sleep(self.DELAY_TERMINATE_AND_QUIT)
937  try:
938  if self.driver is not None:
939  self.driver.quit()
940  except Exception:
941  pass
942  time.sleep(self.DELAY_TERMINATE_AND_QUIT)
943  try:
944  if self.logger:
945  self.logger.debug("Driver pid: " + str(self.driverPid))
946  self.killProcess(self.driverPid)
947  except Exception:
948  if self.logger:
949  self.logger.debug("Error kill driver pid: %s", str(self.driverPid))
950 
951  self.chromeProcessesCleanup(headers)
952 
953  if self.tmpDirType == self.TMP_DIR_TYPE_OPEN:
954  self.removeTmpDirs(self.DELAY_TERMINATE_AND_QUIT)
955 
956 

◆ execMacroExtended()

def dc_crawler.Fetcher.SeleniumFetcher.execMacroExtended (   self,
  macro 
)

Definition at line 1446 of file Fetcher.py.

1446  def execMacroExtended(self, macro):
1447  macroResults = []
1448  error_code = 0
1449  error_msg = ''
1450  content_type = None
1451  result_type = self.MACRO_RESULT_TYPE_DEFAULT
1452 
1453  for mset in macro['sets']:
1454  if 'name' not in mset:
1455  mset['name'] = ''
1456  if 'repeat' not in mset:
1457  mset['repeat'] = '1'
1458  if 'delay' not in mset:
1459  mset['delay'] = '0'
1460  if self.logger is not None:
1461  self.logger.debug("Set:\n%s", str(mset))
1462  for i in xrange(0, int(mset['repeat'])):
1463  if int(mset['delay']) > 0:
1464  time.sleep(int(mset['delay']))
1465  if self.logger is not None:
1466  self.logger.debug("Macro %s in set", str(i))
1467  r, error_code, error_msg = self.execMacroSimple(mset['items'])
1468  if error_code > 0:
1469  break
1470  macroResults += r
1471  if error_code > 0:
1472  break
1473 
1474  if 'result_type' in macro:
1475  result_type = int(macro['result_type'])
1476  self.logger.debug("Macro results type: %s", str(result_type))
1477  if 'result_content_type' in macro:
1478  content_type = str(macro['result_content_type'])
1479  self.logger.debug("Macro results content type: %s", str(content_type))
1480 
1481  if result_type == self.MACRO_RESULT_TYPE_AUTO:
1482  self.logger.debug("Macro results before autodetect type: %s", str(macroResults))
1483  for r in macroResults:
1484  if isinstance(r, basestring):
1485  result_type = self.MACRO_RESULT_TYPE_CONTENT
1486  self.logger.debug("Macro results type autodetected as string content")
1487  break
1488  elif isinstance(r, list):
1489  for ri in r:
1490  if isinstance(ri, basestring):
1491  result_type = self.MACRO_RESULT_TYPE_URLS_LIST
1492  self.logger.debug("Macro results type autodetected as URLs list")
1493  break
1494  if result_type == self.MACRO_RESULT_TYPE_CONTENT:
1495  macroResults = ''.join(macroResults) # pylint: disable=R0204
1496  if result_type == self.MACRO_RESULT_TYPE_URLS_LIST:
1497  macroResults = [item for sublist in macroResults for item in sublist]
1498  self.logger.debug("Macro results after autodetect type: %s", str(macroResults))
1499 
1500  return macroResults, error_code, error_msg, content_type, result_type
1501 
1502 
1503 
1504 # # external Fetcher
1505 #
1506 #
1507 
1508 # # urllib Fetcher
1509 #
1510 #
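The extended macro argument is a dictionary with a 'sets' list; each set holds an 'items' list (executed by execMacroSimple()) plus optional 'name', 'repeat' and 'delay' fields, and the dictionary may also carry 'result_type' (0 default, 1 URLs list, 2 content, 3 auto-detect) and 'result_content_type'. A hedged sketch of such a dictionary, normally passed as the macro argument of open() on a SeleniumFetcher instance (fetcher, as constructed earlier); the JavaScript is illustrative.

  macro = {
      'sets': [
          {'name': 'scroll',          # optional label, only logged
           'repeat': '3',             # run this item set three times
           'delay': '1',              # sleep 1 second before each repetition
           'items': ['window.scrollTo(0, document.body.scrollHeight);', '2']},
          {'items': ['return document.documentElement.outerHTML;']},
      ],
      'result_type': 3,               # MACRO_RESULT_TYPE_AUTO: detect string content vs. URL lists
      'result_content_type': 'text/html',
  }
  res = fetcher.open('http://example.com/', macro=macro,
                     executable_path='/usr/bin/chromedriver')

When the resulting type is MACRO_RESULT_TYPE_CONTENT the joined macro output replaces the page source in the returned Response; otherwise the collected results are JSON-encoded into it.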

◆ execMacroSimple()

def dc_crawler.Fetcher.SeleniumFetcher.execMacroSimple (   self,
  macro 
)

Definition at line 1336 of file Fetcher.py.

1336  def execMacroSimple(self, macro):
1337  macroResults = []
1338  error_code = 0
1339  error_msg = ''
1340  macroCounter = 0
1341  maxLenToLog = 512
1342 
1343  for m in macro:
1344  if self.logger is not None:
1345  self.logger.debug("Macro #%s in set of %s items:\n%s...",
1346  str(macroCounter), str(len(macro)), str(m)[:maxLenToLog])
1347  macroCounter += 1
1348  if m.isdigit():
1349  if self.logger is not None:
1350  self.logger.debug("Macro sleep: %s sec", str(m))
1351  time.sleep(int(m))
1352  else:
1353  iType = 0
1354  iDelay = 0
1355  iMaxIterations = 1
1356  if m.startswith('!'):
1357  m = m[1:]
1358  iType = 1
1359  params = m.split(':')
1360  if len(params) > 2:
1361  iDelay = int(params[0])
1362  iMaxIterations = int(params[1])
1363  m = params[2]
1364  elif len(params) > 1:
1365  iDelay = int(params[0])
1366  m = params[1]
1367  elif len(params) == 1:
1368  m = params[0]
1369  if self.logger is not None:
1370  self.logger.debug("Macro blocking iterative, delay: %s, max ierations: %s",
1371  str(iDelay), str(iMaxIterations))
1372  for i in xrange(0, iMaxIterations):
1373  if iType == 1:
1374  if self.logger is not None:
1375  self.logger.debug("Macro blocking iteration: %s of: %s", str(i + 1), str(iMaxIterations))
1376  if m.startswith('http://') or m.startswith('https://') or m.startswith('file://'):
1377  try:
1378  if m.startswith('file://'):
1379  with open(m[7:].replace('%PID%', str(os.getpid())), 'r') as f:
1380  m = f.read()
1381  else:
1382  r = requests.get(m.replace('%PID%', str(os.getpid())))
1383  m = r.text
1384  if self.logger is not None:
1385  self.logger.debug("Macro %s bytes loaded:\n%s...", str(len(str(m))), str(m)[:maxLenToLog])
1386  except Exception, err:
1387  error_msg = 'Error load macro code, URL: `' + str(m) + '` : ' + str(err)
1388  error_code = self.ERROR_MACRO_RETURN_VALUE
1389  if self.logger is not None:
1390  self.logger.debug(error_msg)
1391  r = None
1392  break
1393  try:
1394  r = self.driver.execute_script(m)
1395  if self.logger is not None:
1396  self.logger.debug("Macro returned: %s", json.dumps(r))
1397  except Exception, err:
1398  error_msg = 'Error macro execution: ' + str(err) + '; logs: ' + self.getAllLogsAsString()
1399  error_code = self.ERROR_MACRO_RETURN_VALUE
1400  if self.logger is not None:
1401  self.logger.debug(error_msg)
1402  r = None
1403  break
1404  if iType == 0 and r is not None:
1405  if isinstance(r, (basestring, list, dict)):
1406  macroResults.append(r)
1407  if isinstance(r, (list, dict)):
1408  if self.logger is not None:
1409  self.logger.debug("Macro items returned: %s", str(len(r)))
1410  else:
1411  error_msg = 'Error macro result value, type is: ' + str(type(r))
1412  error_code = self.ERROR_MACRO_RETURN_VALUE
1413  if self.logger is not None:
1414  self.logger.debug(error_msg)
1415  break
1416  elif iType == 1:
1417  if r is True:
1418  if self.logger is not None:
1419  self.logger.debug("Macro blocking got `True` on iteration: %s, sleeped: %s sec",
1420  str(i + 1), str(int(iDelay) * i))
1421  break
1422  elif r is not True and iDelay > 0:
1423  if self.logger is not None:
1424  self.logger.debug("Macro blocking iteration: %s sleep on: %s sec", str(i + 1), str(iDelay))
1425  time.sleep(int(iDelay))
1426  if iType == 1 and r is not True:
1427  if self.logger is not None:
1428  self.logger.debug("Macro blocking finished, but no `True` value returned!")
1429  if error_code > 0:
1430  break
1431 
1432  return macroResults, error_code, error_msg
1433 
1434 
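A simple macro is a list of strings: a purely numeric item sleeps that many seconds, an item prefixed with '!' is executed repeatedly in the form '!delay:maxIterations:script' until it returns true, an item starting with http://, https:// or file:// is first loaded and its body used as the script, and anything else is passed to driver.execute_script() with string, list or dict return values collected. The same syntax works for a single inline macro appended to the URL after the '###' delimiter. A hedged sketch (the JavaScript is illustrative), normally passed as the macro argument of open() on a SeleniumFetcher instance (fetcher, as constructed earlier).

  macro = [
      '2',                                                  # plain number: sleep 2 seconds
      'window.scrollTo(0, document.body.scrollHeight);',    # run via driver.execute_script()
      '!1:10:return document.readyState == "complete";',    # poll every 1 s, up to 10 times, until True
      'return [].slice.call(document.links).map(function(a){ return a.href; });',
  ]
  res = fetcher.open('http://example.com/', macro=macro,
                     executable_path='/usr/bin/chromedriver')

Used internally, execMacroSimple() returns the tuple (macroResults, error_code, error_msg); with a plain list macro the collected results are JSON-encoded into the returned Response content.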

◆ getAllLogsAsString()

def dc_crawler.Fetcher.SeleniumFetcher.getAllLogsAsString (   self)

Definition at line 1438 of file Fetcher.py.

1438  def getAllLogsAsString(self):
1439  return 'browser: ' + str(self.driver.get_log('browser')) + '; driver: ' + str(self.driver.get_log('driver'))
1440 
1441 

◆ getOptions()

def dc_crawler.Fetcher.SeleniumFetcher.getOptions (   self,
  webdriver,
  headers,
  proxies,
  url 
)

Definition at line 1076 of file Fetcher.py.

1076  def getOptions(self, webdriver, headers, proxies, url):
1077  chrome_option = webdriver.ChromeOptions()
1078 
1079  arg_disable_http_cache = "--disable-http-cache"
1080  arg_clear_data_reduction_proxy_data_savings = '--clear-data-reduction-proxy-data-savings'
1081  arg_host_resolver_retry_attempts = '--host-resolver-retry-attempts=0'
1082  arg_start_maximized = '--start-maximized'
1083  if headers is not None and '--use-mobile-user-agent' in headers:
1084  use_mobile_user_agent = '--use-mobile-user-agent'
1085  else:
1086  use_mobile_user_agent = None
1087 
1088  if headers is not None and '--disable-web-security' in headers:
1089  disable_web_security = '--disable-web-security'
1090  else:
1091  disable_web_security = None
1092  # disable_web_security = '--disable-web-security'
1093 
1094  if headers is not None and '--allow-running-insecure-content' in headers:
1095  allow_running_insecure_content = '--allow-running-insecure-content'
1096  else:
1097  allow_running_insecure_content = None
1098  # allow_running_insecure_content = '--allow-running-insecure-content'
1099 
1100  if headers is not None and '--allow-file-access-from-files' in headers:
1101  allow_file_access_from_files = '--allow-file-access-from-files'
1102  else:
1103  allow_file_access_from_files = None
1104  # allow_file_access_from_files = '--allow-file-access-from-files'
1105 
1106  if headers is not None and '--proxy-bypass-list' in headers:
1107  arg_proxy_bypass_list = '--proxy-bypass-list=' + headers['--proxy-bypass-list']
1108  else:
1109  arg_proxy_bypass_list = None
1110 
1111  # if headers is not None and 'User-Agent' in headers and '--user-agent' in headers and\
1112  if headers is not None and 'User-Agent' in headers and\
1113  '--use-mobile-user-agent' not in headers:
1114  arg_user_agent = '--user-agent=' + headers['User-Agent']
1115  else:
1116  arg_user_agent = None
1117  if '--disk-cache-dir' in headers:
1118  if os.path.isdir(headers['--disk-cache-dir']):
1119  arg_disk_cache_dir = '--disk-cache-dir=' + headers['--disk-cache-dir']
1120  else:
1121  if self.logger:
1122  self.logger.debug("Header `--disk-cache-dir` directory: `%s` not found!", headers['--disk-cache-dir'])
1123  else:
1124  arg_disk_cache_dir = None
1125  if '--profile-directory' in headers:
1126  if os.path.isdir(headers['--profile-directory']):
1127  arg_profile_directory = '--profile-directory=' + headers['--profile-directory']
1128  else:
1129  if self.logger:
1130  self.logger.debug("Header `--profile-directory` directory: `%s` not found!", headers['--profile-directory'])
1131  else:
1132  arg_profile_directory = None
1133  if '--user-data-dir' in headers:
1134  if os.path.isdir(headers['--user-data-dir']):
1135  arg_user_data_dir = '--user-data-dir=' + headers['--user-data-dir']
1136  self.userDataDirUsed = headers['--user-data-dir']
1137  else:
1138  if self.logger:
1139  self.logger.debug("Header `--user-data-dir` directory: `%s` not found!", headers['--user-data-dir'])
1140  else:
1141  if self.tmpDir != '':
1142  arg_user_data_dir = '--user-data-dir=' + self.tmpDir
1143  self.userDataDirUsed = self.tmpDir
1144  else:
1145  arg_user_data_dir = None
1146  if self.logger:
1147  self.logger.error("Empty tmp dir configured!")
1148 
1149  if self.userDataDirUsed != '' and not os.path.isdir(self.userDataDirUsed):
1150  if self.logger:
1151  self.logger.debug("Profile archive user data dir `%s` not found, trying to create...",
1152  str(self.userDataDirUsed))
1153  try:
1154  os.makedirs(self.userDataDirUsed)
1155  except Exception, err:
1156  if self.logger:
1157  self.logger.debug("Profile archive user data dir creation error: %s", str(err))
1158  if os.path.isdir(self.userDataDirUsed):
1159  if self.logger:
1160  self.logger.debug("Profile archive user data dir `%s` created", str(self.userDataDirUsed))
1161 
1162  if '--user-data-dir-zip' in headers and self.userDataDirUsed != '' and os.path.isdir(self.userDataDirUsed):
1163  try:
1164  profiles = [p.strip() for p in headers['--user-data-dir-zip'].split(',') if p.strip() != '']
1165  if '--user-data-dir-zip-rotation' in headers and headers['--user-data-dir-zip-rotation'] is not None and\
1166  headers['--user-data-dir-zip-rotation'] != '':
1167  rotationType = int(headers['--user-data-dir-zip'])
1168  else:
1169  rotationType = 0
1170  profileIndex = 0
1171  if len(profiles) > 1:
1172  if rotationType == 0:
1173  r = [randint(0, len(profiles) - 1) for p in range(0, len(profiles) - 1)]
1174  profileIndex = r[0]
1175  elif rotationType == 1:
1176  pass
1177  elif rotationType == 2:
1178  pass
1179 # os.system('unzip -qq ' + profiles[profileIndex] + ' -d ' + self.userDataDirUsed)
1180 # os.system('mv ' + self.userDataDirUsed + '/' + \
1181 # os.path.splitext(os.path.basename(profiles[profileIndex]))[0] + \
1182 # '/* ' + self.userDataDirUsed)
1183 
1184  res = Utils.executeCommand('unzip -qq ' + profiles[profileIndex] + ' -d ' + self.userDataDirUsed)
1185  if res.exitCode != APP_CONSTS.EXIT_SUCCESS:
1186  raise Exception(str(res.stderr))
1187 
1188  res = Utils.executeCommand('mv ' + self.userDataDirUsed + '/' + \
1189  os.path.splitext(os.path.basename(profiles[profileIndex]))[0] + \
1190  '/* ' + self.userDataDirUsed)
1191  if res.exitCode != APP_CONSTS.EXIT_SUCCESS:
1192  raise Exception(str(res.stderr))
1193 
1194  if self.logger:
1195  self.logger.debug("Profile archive `%s` extracted to `%s` directory, rotation: %s",
1196  profiles[profileIndex], self.userDataDirUsed, str(rotationType))
1197  except Exception, err:
1198 # if self.logger:
1199 # self.logger.error("Profile archive extraction error: %s", str(err))
1200  raise Exception("Profile archive extraction error: %s" % str(err))
1201 
1202  else:
1203  d = {'--user-data-dir-zip in headers':str('--user-data-dir-zip' in headers),
1204  'self.userDataDirUsed':self.userDataDirUsed,
1205  'os.path.isdir(self.userDataDirUsed)':str(os.path.isdir(self.userDataDirUsed))}
1206  if self.logger:
1207  self.logger.debug("Profile archive not used, condition data:\n%s", str(d))
1208 
1209  arg_dns_prefetch_disable = '--dns-prefetch-disable'
1210  # --disk-cache-size=1
1211  # --media-cache-size=1
1212  # --safe-plugins
1213 
1214  if headers is not None and '--disable-setuid-sandbox' in headers:
1215  chrome_option.add_argument('--disable-setuid-sandbox')
1216 
1217  if headers is not None and '--no-sandbox' in headers:
1218  chrome_option.add_argument('--no-sandbox')
1219 
1220  if headers is not None and '--incognito' in headers:
1221  chrome_option.add_argument('--incognito')
1222 
1223  # chrome_option.add_argument('--enable-logging')
1224  # chrome_option.add_argument('--v=1')
1225  # chrome_option.add_argument('--log-level=0')
1226 
1227  if arg_user_agent is not None:
1228  chrome_option.add_argument(arg_user_agent)
1229  if use_mobile_user_agent is not None:
1230  chrome_option.add_argument(use_mobile_user_agent)
1231  if disable_web_security is not None:
1232  chrome_option.add_argument(disable_web_security)
1233  if allow_running_insecure_content is not None:
1234  chrome_option.add_argument(allow_running_insecure_content)
1235  if allow_file_access_from_files is not None:
1236  chrome_option.add_argument(allow_file_access_from_files)
1237  if arg_proxy_bypass_list is not None:
1238  chrome_option.add_argument(arg_proxy_bypass_list)
1239  chrome_option.add_argument(arg_disable_http_cache)
1240  chrome_option.add_argument(arg_clear_data_reduction_proxy_data_savings)
1241  chrome_option.add_argument(arg_host_resolver_retry_attempts)
1242  chrome_option.add_argument(arg_start_maximized)
1243  if arg_disk_cache_dir is not None and arg_disk_cache_dir != '':
1244  chrome_option.add_argument(arg_disk_cache_dir)
1245  if arg_profile_directory is not None and arg_profile_directory != '':
1246  chrome_option.add_argument(arg_profile_directory)
1247  if arg_user_data_dir is not None and arg_user_data_dir != '':
1248  chrome_option.add_argument(arg_user_data_dir)
1249  chrome_option.add_argument(arg_dns_prefetch_disable)
1250  # chrome_option.add_argument(arg_incognito)
1251 
1252  if self.sessionId != '':
1253  chrome_option.add_argument(self.sessionId)
1254 
1255  # Proxy options
1256  if proxies is not None:
1257  proxy_type, proxy_host, proxy_port, proxy_user, proxy_passwd = proxies
1258  if self.logger:
1259  self.logger.debug("Proxy used from argument tuple: %s", str(proxies))
1260  if proxy_user:
1261  proxies = proxy_type + "://%s:%s@%s:%s" % (proxy_user, proxy_passwd, proxy_host, proxy_port)
1262  else:
1263  proxies = proxy_type + "://%s:%s" % (proxy_host, proxy_port)
1264  chrome_option.add_argument("--proxy-server=" + proxies)
1265  else:
1266  if '--proxy-http' in headers and headers['--proxy-http'] is not None and headers['--proxy-http'] != '':
1267  if '--proxy-http-domains' in headers and headers['--proxy-http-domains'] is not None and\
1268  headers['--proxy-http-domains'] != '':
1269  dn = self.getDomainNameFromURL(url)
1270  domain = bool(dn in headers['--proxy-http-domains'].split(','))
1271  if self.logger and domain is False:
1272  self.logger.debug("Proxy not used because domain `%s` not listed in `--proxy-http-domains`", str(dn))
1273  else:
1274  domain = True
1275  if domain:
1276  p = headers['--proxy-http'].replace('%3A', ':')
1277  if self.logger:
1278  self.logger.debug("Proxy used from header: %s", str(p))
1279  chrome_option.add_argument("--proxy-server=" + p)
1280 
1281  return chrome_option
1282 
1283 
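Most Chrome command-line switches are taken straight from keys of the headers dictionary, as shown in the listing above: the mere presence of flags such as '--incognito', '--no-sandbox' or '--disable-setuid-sandbox' is enough, while directory and proxy options carry values. A hedged sketch of a headers dictionary as it would be passed through open() on a SeleniumFetcher instance (fetcher, as constructed earlier); paths and hosts are illustrative.

  headers = {
      'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)',    # becomes --user-agent=... unless --use-mobile-user-agent is set
      '--incognito': '',                                   # presence of the key adds the flag
      '--no-sandbox': '',
      '--user-data-dir': '/tmp/chrome_profile',            # must already exist; omit the key to fall back to the instance tmp dir
      '--proxy-http': 'proxy.local%3A3128',                # '%3A' is replaced with ':' -> --proxy-server=proxy.local:3128
      '--proxy-http-domains': 'example.com,example.org',   # proxy applied only when the URL's domain is listed
  }
  res = fetcher.open('http://example.com/', headers=headers,
                     executable_path='/usr/bin/chromedriver')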

◆ getProcessDirs()

def dc_crawler.Fetcher.SeleniumFetcher.getProcessDirs (   self,
  process,
  dirsTemplate 
)

Definition at line 1050 of file Fetcher.py.

1050  def getProcessDirs(self, process, dirsTemplate):
1051  ret = []
1052 
1053  for f in process.open_files():
1054  # if self.logger:
1055  # self.logger.debug("Path candidate: %s", str(f.path))
1056  fp = f.path.split('/')
1057  fpr = ''
1058  templateFound = False
1059  for item in fp:
1060  fpr += '/' + item
1061  if dirsTemplate is not None and dirsTemplate != '' and dirsTemplate in item:
1062  templateFound = True
1063  break
1064  if templateFound or dirsTemplate is None or dirsTemplate == '':
1065  ret.append(fpr)
1066 
1067  return ret
1068 
1069 

◆ initializeTmpDirs()

def dc_crawler.Fetcher.SeleniumFetcher.initializeTmpDirs (   self,
  headers 
)

Definition at line 1289 of file Fetcher.py.

1289  def initializeTmpDirs(self, headers):
1290  ret = True
1291 
1292  if self.tmpDir != '':
1293  try:
1294  if headers is not None and 'tmp-dir' in headers:
1295  self.tmpDir = headers['tmp-dir']
1296  if self.tmpDirRemoveBeforeCreate:
1297  self.removeTmpDirs()
1298  if not os.path.isdir(self.tmpDir):
1299  if logger is not None:
1300  self.logger.debug("Create temporary directory: %s", str(self.tmpDir))
1301  os.makedirs(self.tmpDir)
1302  except Exception, err:
1303  if self.logger is not None:
1304  ret = False
1305  if logger is not None:
1306  self.logger.debug("Error temporary directories initialization: %s", str(err))
1307 
1308  if os.path.isdir(self.tmpDir):
1309  ret = True
1310 
1311  return ret
1312 
1313 
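A 'tmp-dir' key in the headers dictionary overrides the directory computed in the constructor; the directory is (re)created with os.makedirs() and True is returned when it exists afterwards. Since open() calls this method with the request headers, the override can simply be passed there. A minimal sketch with an illustrative path, on a SeleniumFetcher instance (fetcher).

  headers = {'tmp-dir': '/tmp/dfetcher_custom'}
  ok = fetcher.initializeTmpDirs(headers)   # True when the directory exists after the call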

◆ killProcess()

def dc_crawler.Fetcher.SeleniumFetcher.killProcess (   self,
  pid,
  dirsTemplate = CHROME_DIRS_TEMPLATE,
  dirDeleteBeforeTimeout = DELAY_TERMINATE_AND_QUIT 
)

Definition at line 1002 of file Fetcher.py.

1002  def killProcess(self, pid, dirsTemplate=CHROME_DIRS_TEMPLATE, dirDeleteBeforeTimeout=DELAY_TERMINATE_AND_QUIT):
1003  del dirsTemplate, dirDeleteBeforeTimeout
1004  try:
1005  if self.logger:
1006  self.logger.debug("Try to Kill process pid: %s", str(pid))
1007  process = psutil.Process(pid)
1008  for proc in process.children(recursive=True):
1009  if self.logger:
1010  self.logger.debug("Killing child process pid: %s", str(proc.pid))
1011  try:
1012  # dirs = self.getProcessDirs(proc, dirsTemplate)
1013  proc.kill()
1014  # if self.logger:
1015  # self.logger.debug("Dirs to remove: %s", str(dirs))
1016  # for d in dirs:
1017  # time.sleep(dirDeleteBeforeTimeout)
1018  # self.removeTmpDirs(d)
1019  # os.kill(pid, signal.SIGKILL)
1020  except Exception, err:
1021  if self.logger:
1022  self.logger.debug("Child process pid: %s kill error: ", str(pid), str(err))
1023  if self.logger:
1024  self.logger.debug("Killing main process pid: %s", str(process.pid))
1025  # dirs = self.getProcessDirs(process, dirsTemplate)
1026  process.kill()
1027  # if self.logger:
1028  # self.logger.debug("Dirs to remove: %s", str(dirs))
1029  # for d in dirs:
1030  # time.sleep(dirDeleteBeforeTimeout)
1031  # self.removeTmpDirs(d)
1032  # os.kill(pid, signal.SIGKILL)
1033  except Exception, err:
1034  if self.logger:
1035  self.logger.debug("Process pid: %s kill error: %s", str(pid), str(err))
1036 
1037 

◆ open()

def dc_crawler.Fetcher.SeleniumFetcher.open (   self,
  url,
  method = 'get',
  headers = None,
  timeout = DEFAUIL_TIMEOUT,
  allow_redirects = True,
  proxies = None,
  auth = None,
  data = None,
  log = None,
  allowed_content_types = None,
  max_resource_size = None,
  max_redirects = 1,
  filters = None,
  executable_path = None,
  depth = None,
  macro = None 
)

Definition at line 527 of file Fetcher.py.

527  macro=None):
528 
529  if log is not None:
530  self.logger = log
531 
532  if self.logger is not None:
533  self.logger.debug("Dynamic fetcher call:\nurl:" + str(url) + \
534  "\nmethod:" + str(method) + "\nheaders:" + str(headers) + "\ntimeout:" + str(timeout) + \
535  "\nallow_redirects:" + str(allow_redirects) + "\nproxies:" + str(proxies) + "\nauth:" + \
536  str(auth) + "\ndata:" + str(data) + "\nlogger:" + str(self.logger) + \
537  "\nallowed_content_types:" + str(allowed_content_types) + "\nmax_resource_size:" + \
538  str(max_resource_size) + "\nmax_redirects:" + str(max_redirects) + "\nexecutable_path:" + \
539  str(executable_path) + "\ncur_dir:" + str(os.getcwd()) + "\nmacro:" + str(macro))
540 
541  t1 = 0
542  if isinstance(timeout, tuple):
543  t = int(timeout[0])
544  if isinstance(timeout[0], float):
545  t1 = int(str(timeout[0]).strip()[str(timeout[0]).strip().find('.') + 1:])
546  else:
547  t = int(timeout)
548  if isinstance(timeout, float):
549  t1 = int(str(timeout).strip()[str(timeout).strip().find('.') + 1:])
550  if self.logger is not None:
551  self.logger.debug("Execution timeout: %s, damping timeout: %s", str(t), str(t1))
552  if t1 >= t:
553  msg = "Execution timeout: %s less or equal than damping timeout: %s, aborted" % (str(t), str(t1))
554  if self.logger is not None:
555  self.logger.error(msg)
556  raise SeleniumFetcherException(msg)
557 
558  if self.tmpDirType == self.TMP_DIR_TYPE_OPEN:
559  if not self.initializeTmpDirs(headers):
560  msg = 'Temporary directory type OPEN `%s` initialization error!' % self.tmpDir
561  if self.logger is not None:
562  self.logger.error(msg)
563  raise SeleniumFetcherException(msg)
564  else:
565  if self.logger is not None:
566  self.logger.debug('Temporary directory type OPEN `%s` initialized', self.tmpDir)
567 
568  from app.Utils import executeWithTimeout
569  try:
570  ret = executeWithTimeout(func=self.openT, args=(url, headers, t1, proxies, executable_path, macro,),
571  timeout=t, log=self.logger)
572  if ret is None:
573  if self.logger is not None:
574  msg = 'Execution timeout: ' + str(t) + ' reached!'
575  self.logger.error(msg)
576  raise SeleniumFetcherException(msg, APP_CONSTS.ERROR_FETCH_TIMEOUT)
577  except SeleniumFetcherException, err:
578  if self.logger is not None:
579  self.logger.error("Error SeleniumFetcherException: %s", str(err))
580  self.cleanup(1, headers)
581  raise err
582  except Exception, err:
583  if self.logger is not None:
584  msg = 'Execution with timeout error:' + str(err)
585  self.logger.error(msg)
586  self.cleanup(1, headers)
587  raise SeleniumFetcherException(msg)
588  finally:
589  self.cleanup(0, headers)
590 
591  if self.logger is not None:
592  self.logger.debug("Dynamic fetcher call finished normally.")
593 
594  return ret
595 
596 
597 
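In the listing above the timeout argument is split in two: the integer part becomes the overall execution limit handed to executeWithTimeout(), while the digits after the decimal point (of the float, or of the first tuple element) are read as a whole number of seconds and passed to openT() as the "damping" wait performed after the page has loaded. The damping value must stay below the limit, otherwise a SeleniumFetcherException is raised. A short illustration on a SeleniumFetcher instance (fetcher); URL and driver path are examples.

  # timeout=30.5  -> 30 s execution limit, 5 s post-load wait
  # timeout=20.15 -> 20 s execution limit, 15 s post-load wait
  # timeout=20    -> 20 s execution limit, no post-load wait
  res = fetcher.open('http://example.com/', timeout=30.5,
                     executable_path='/usr/bin/chromedriver')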

◆ openT()

def dc_crawler.Fetcher.SeleniumFetcher.openT (   self,
  url,
  headers,
  timeout,
  proxies,
  executable_path,
  macro 
)

Definition at line 607 of file Fetcher.py.

607  def openT(self, url, headers, timeout, proxies, executable_path, macro):
608  startTime = time.time()
609  inlineMacro = ''
610 
611  try:
612  # Prepare inline macro
613  if self.inlineURLMacroDelimiter in url:
614  t = url.split(self.inlineURLMacroDelimiter)
615  url = t[0]
616  inlineMacro = t[1]
617  # Dependent import
618  try:
619  from selenium import webdriver
620  import selenium.webdriver.support.ui # pylint: disable=W0611
621  except Exception, err:
622  msg = 'Selenium module import error: ' + str(err)
623  if self.logger is not None:
624  self.logger.error(msg)
625  raise SeleniumFetcherException(msg)
626 
627  if self.logger is not None:
628  # One way
629  from selenium.webdriver.remote.remote_connection import LOGGER as seleniumLogger
630  seleniumLogger.setLevel(self.logger.getEffectiveLevel())
631  # Second way
632  selenium_logger = logging.getLogger('selenium.webdriver.remote.remote_connection')
633  # Only display possible problems
634  # selenium_logger.setLevel(logging.WARNING)
635  selenium_logger.setLevel(self.logger.getEffectiveLevel())
636 
637  # Initialize defaults
638  exec_path = "./"
639  driver_name = "chromedriver"
640  error_msg = ""
641  error_code = 0
642  error_code_macro = 0
643  page_source_macro = None
644  content_type_macro = None
645  result_type_macro = self.MACRO_RESULT_TYPE_DEFAULT
646  fatalErrors = [self.ERROR_FATAL, self.ERROR_GENERAL, self.ERROR_NAME_NOT_RESOLVED, self.ERROR_TOO_MANY_REDIRECTS,
647  self.ERROR_PROXY_CONNECTION_FAILED, self.ERROR_CONNECTION_TIMED_OUT, self.ERROR_CONFLICT,
648  self.ERROR_TUNNEL_CONNECTION_FAILED, self.ERROR_EMPTY_RESPONSE, self.ERROR_SERVICE_UNAVAILABLE]
649 
650  # Check environment
651  # TODO: add dependecy argument pass, now reduced and hardcoded
652  checkEnv = True
653  if checkEnv:
654  # envVars = {"DISPLAY": "", "LC_ALL":"en_US.UTF-8", "LANG":"en_US.UTF-8", "LANGUAGE":"en_US.UTF-8"}
655  envVars = {"DISPLAY": "", "LANG":"en_US.UTF-8"}
656  for varName in envVars:
657  v = os.getenv(varName, "")
658  if varName == "DISPLAY":
659  if v == "":
660  raise SeleniumFetcherException("Environment variable 'DISPLAY' is not set!")
661  else:
662  if v != envVars[varName]:
663  raise SeleniumFetcherException("Environment variable '" + varName + "' value expected:'" + \
664  envVars[varName] + "', got from os: '" + v + "'; all env: " + \
665  str(os.environ))
666 
667  # Create driver instance
668  try:
669  # get chrome options
670  chrome_option = self.getOptions(webdriver, headers, proxies, url)
671 
672  # The platform-dependent path to the driver executable
673  if executable_path is None:
674  path = exec_path + driver_name + str(ctypes.sizeof(ctypes.c_voidp) * 8)
675  else:
676  path = executable_path
677  if self.logger is not None:
678  self.logger.debug("Chrome driver executable path: %s, options: %s", str(path), str(chrome_option.arguments))
679  from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
680  # enable browser logging
681  d = DesiredCapabilities.CHROME
682  d['loggingPrefs'] = {'browser':'ALL'}
683  # Get driver
684  self.driver = webdriver.Chrome(executable_path=path, chrome_options=chrome_option, desired_capabilities=d)
685  self.driverPid = self.driver.service.process.pid
686  if self.logger:
687  self.logger.debug("Driver pid: " + str(self.driverPid))
688  except Exception, err:
689  error_msg = 'Driver initialization error: ' + str(err)
690  error_code = self.ERROR_FATAL
691  except: # pylint: disable=W0702
692  error_msg = 'General driver initialization!'
693  error_code = self.ERROR_GENERAL
694 
695  if error_code > 0:
696  if self.logger is not None:
697  self.logger.error('Fatal error: ' + error_msg)
698  raise SeleniumFetcherException(error_msg)
699 
700  # Make request
701  try:
702  # driver.set_page_load_timeout(timeout * 10)
703  # driver.set_script_timeout(timeout * 10)
704  # driver.implicitly_wait(timeout * 10)
705 
706  if self.logger is not None:
707  self.logger.debug("Chrome driver get url: `%s`", str(url))
708  self.driver.get(url)
709  # Get logs
710  log_types = self.driver.log_types
711  if 'browser' in log_types:
712  log_list = self.driver.get_log('browser')
713  if self.logger is not None:
714  self.logger.debug("Driver logs: " + str(log_list))
715  for item_dict in log_list:
716  if self.logger is not None:
717  self.logger.debug("Driver message: `%s`", str(item_dict["message"]))
718  if "message" in item_dict and ((url + ' ') in item_dict["message"] or (url + '/ ') in item_dict["message"]):
719  error_msg += item_dict["message"] + " | "
720  else:
721  if self.logger is not None:
722  self.logger.error("No driver logs!")
723  if error_msg != "":
724  entrances = [
725  (r"(.*)net::ERR_NAME_NOT_RESOLVED(.*)", self.ERROR_NAME_NOT_RESOLVED),
726  (r"(.*)net::ERR_TOO_MANY_REDIRECTS(.*)", self.ERROR_TOO_MANY_REDIRECTS),
727  (r"(.*)ERR_PROXY_CONNECTION_FAILED(.*)", self.ERROR_PROXY_CONNECTION_FAILED),
728  (r"(.*)net::ERR_CONNECTION_TIMED_OUT(.*)", self.ERROR_CONNECTION_TIMED_OUT),
729  (r"(.*)net::ERR_TUNNEL_CONNECTION_FAILED(.*)", self.ERROR_TUNNEL_CONNECTION_FAILED),
730  (r"(.*)net::ERR_CONNECTION_RESET(.*)", self.ERROR_TUNNEL_CONNECTION_FAILED),
731  (r"(.*)net::ERR_INVALID_URL(.*)", self.ERROR_TUNNEL_CONNECTION_FAILED),
732  (r"(.*)net::ERR_EMPTY_RESPONSE(.*)", self.ERROR_EMPTY_RESPONSE),
733  (r"(.*)" + self.LOG_MESSAGE_RENDERRER_TIMEOUT + r"(.*)", self.ERROR_CONNECTION_TIMED_OUT),
734  (r"(.*)" + self.LOG_MESSAGE_SERVER_RESPONSE_503 + r"(.*)", self.ERROR_SERVICE_UNAVAILABLE),
735  (r"(.*)" + self.LOG_MESSAGE_SERVER_RESPONSE_409 + r"(.*)", self.ERROR_CONFLICT),
736  (r"(.*)403 \(Forbidden\)(.*)", 403),
737  (r"(.*)404 \(Not Found\)(.*)", 404),
738  (r"(.*)500 \(Internal Server Error\)(.*)", 500),
739  (r"(.*)net::(.*)", 520)]
740  for item in entrances:
741  regex = re.compile(item[0])
742  r = regex.search(error_msg)
743  if r:
744  error_code = item[1]
745  if self.logger is not None:
746  self.logger.debug("Page error: " + error_msg)
747  break
748  if error_code not in fatalErrors and inlineMacro != '':
749  if self.logger is not None:
750  self.logger.debug("Execute inline macro: %s", str(inlineMacro))
751  macroResults, errorCode, errorMsg = self.execMacroSimple([inlineMacro])
752  if error_code not in fatalErrors and macro is not None:
753  if self.logger is not None:
754  self.logger.debug("Execute macro: %s", str(macro))
755  if isinstance(macro, list):
756  macroResults, errorCode, errorMsg = self.execMacroSimple(macro)
757  else:
758  macroResults, errorCode, errorMsg, content_type_macro, result_type_macro = self.execMacroExtended(macro)
759  if errorCode > 0:
760  error_code_macro |= APP_CONSTS.ERROR_MACRO
761  error_msg = errorMsg
762  if len(macroResults) > 0:
763  if result_type_macro == self.MACRO_RESULT_TYPE_CONTENT:
764  page_source_macro = macroResults
765  else:
766  page_source_macro = json.dumps(macroResults, ensure_ascii=False) # pylint: disable=R0204
767  except Exception, err:
768  error_msg = 'Driver error: ' + str(err) + '; logs: ' + self.getAllLogsAsString()
769  error_code = self.ERROR_FATAL
770  except: # pylint: disable=W0702
771  error_msg = "General driver usage error!"
772  error_code = self.ERROR_GENERAL
773 
774  if error_code == 0:
775  if timeout > 0:
776  if self.logger is not None:
777  self.logger.debug("Wait on damping timeout to load all dynamic parts of the page: %s sec", str(timeout))
778  # Wait fixed time to load all dynamic parts of the page
779  time.sleep(timeout)
780  elif error_code in fatalErrors:
781  if self.logger is not None:
782  self.logger.debug("Fatal error, code: %s, msg: %s", str(error_code), error_msg)
783  if error_code == self.ERROR_NAME_NOT_RESOLVED:
784  code = APP_CONSTS.ERROR_FETCH_INVALID_URL
785  elif error_code == self.ERROR_TOO_MANY_REDIRECTS:
786  code = APP_CONSTS.ERROR_FETCH_TOO_MANY_REDIRECTS
787  elif error_code == self.ERROR_PROXY_CONNECTION_FAILED:
788  code = APP_CONSTS.ERROR_FETCH_CONNECTION_ERROR
789  elif error_code == self.ERROR_CONNECTION_TIMED_OUT:
790  code = APP_CONSTS.ERROR_FETCH_CONNECTION_TIMEOUT
791  elif error_code == self.ERROR_TUNNEL_CONNECTION_FAILED:
792  code = APP_CONSTS.ERROR_FETCH_FORBIDDEN
793  elif error_code == self.ERROR_EMPTY_RESPONSE:
794  code = APP_CONSTS.ERROR_EMPTY_RESPONSE
795  elif error_code == self.ERROR_SERVICE_UNAVAILABLE:
796  code = APP_CONSTS.ERROR_FETCH_FORBIDDEN
797  elif error_code == self.ERROR_CONFLICT:
798  code = APP_CONSTS.ERROR_FETCH_HTTP_ERROR
799  else:
800  code = APP_CONSTS.ERROR_FETCHER_INTERNAL
801  # self.cleanup(driver)
802  raise SeleniumFetcherException(error_msg, code)
803 
804  page_source = ""
805  cookies = {}
806  try:
807  page_source = self.driver.page_source
808  cookies = self.driver.get_cookies()
809  except Exception, err:
810  error_msg = str(err)
811  error_code = self.ERROR_CONTENT_OR_COOKIE
812  except: # pylint: disable=W0702
813  error_msg = "Content and cookies get error!"
814  error_code = self.ERROR_CONTENT_OR_COOKIE
815 
816  content_type = None
817  charset = None
818  try:
819  attr = self.driver.find_element_by_xpath(".//meta[translate(@http-equiv,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')='content-type']").get_attribute("content") # pylint: disable=C0301
820  regex = re.compile(r"(.*); charset=(.*)", re.IGNORECASE)
821  items = regex.search(attr)
822  if items is not None:
823  items = items.groups()
824  if len(items) > 1:
825  content_type = items[0]
826  charset = items[1]
827  except Exception, err:
828  pass
829  if content_type is None:
830  try:
831  attr = self.driver.find_element_by_xpath('//html')
832  content_type = self.CONTENT_TYPE_HTML
833  except Exception, err:
834  pass
835  if content_type is not None and charset is None:
836  try:
837  charset = self.driver.find_element_by_xpath('//meta[@charset]').get_attribute("charset")
838  except Exception, err:
839  pass
840  if charset is None:
841  try:
842  charset = self.driver.execute_script("return document.characterSet;")
843  except Exception, err:
844  if self.logger is not None:
845  self.logger.debug("Charset detection error: %s", str(err))
846 
847  try:
848  current_url = self.driver.current_url
849  except Exception, err:
850  current_url = url
851  if self.logger is not None:
852  self.logger.debug("Get 'current_url' error: %s, input url assumed: %s", str(err), str(url))
853 
854  # if self.LOG_MESSAGE_RENDERRER_TIMEOUT in error_msg:
855  # self.cleanup(driver)
856  # raise SeleniumFetcherException(error_msg, APP_CONSTS.ERROR_FETCH_CONNECTION_TIMEOUT)
857 
858  try:
859  res = Response()
860  res.url = current_url
861  if error_code > 100 or error_code == self.ERROR_FATAL:
862  res.status_code = error_code
863  else:
864  res.status_code = 200
865  res.redirects = []
866  if page_source_macro is None:
867  res.unicode_content = page_source
868  else:
869  res.unicode_content = page_source_macro
870  res.str_content = res.unicode_content
871  res.rendered_unicode_content = res.unicode_content
872  res.content_size = len(res.unicode_content)
873  res.encoding = charset
874  res.headers = {'content-length': res.content_size}
875  if page_source_macro is not None:
876  if content_type_macro is not None:
877  content_type = content_type_macro
878  else:
879  content_type = self.CONTENT_TYPE_JSON
880  if content_type is not None:
881  res.headers['content-type'] = content_type
882  if current_url != url:
883  res.headers['location'] = current_url
884  res.meta_res = res.unicode_content
885  res.cookies = cookies
886  res.dynamic_fetcher_type = driver_name
887  res.dynamic_fetcher_result_type = result_type_macro
888  if error_code_macro != APP_CONSTS.ERROR_OK:
889  res.error_mask |= error_code_macro
890  res.time = time.time() - startTime
891  res.request = {'headers':headers} # # alexv
892  res.error_msg = error_msg
893  except Exception, err:
894  msg = 'Response fill error: ' + str(err)
895  if self.logger is not None:
896  self.logger.error(msg)
897  raise SeleniumFetcherException(msg)
898 
899  if self.logger is not None and error_msg != "":
900  self.logger.debug("Dynamic fetcher none fatal error: " + error_msg)
901 
902  return res
903 
904  except Exception, err:
905  msg = 'Unrecognized dynamic fetcher error: ' + str(err)
906  if self.logger is not None:
907  self.logger.error(msg)
908  raise SeleniumFetcherException(msg)
909 
910 
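openT() refuses to run unless the environment matches the check near the top of the listing: DISPLAY must be non-empty and LANG must be exactly 'en_US.UTF-8'. A hedged sketch of the required setup on a SeleniumFetcher instance (fetcher); the ':99' display is an example, e.g. an Xvfb virtual display started beforehand.

  import os

  os.environ['DISPLAY'] = ':99'          # any non-empty value passes the check
  os.environ['LANG'] = 'en_US.UTF-8'     # must match exactly
  res = fetcher.open('http://example.com/', headers={},
                     executable_path='/usr/bin/chromedriver')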

◆ removeTmpDirs()

def dc_crawler.Fetcher.SeleniumFetcher.removeTmpDirs (   self,
  delay = DELAY_TERMINATE_AND_QUIT,
  tries = 3 
)

Definition at line 1316 of file Fetcher.py.

1316  def removeTmpDirs(self, delay=DELAY_TERMINATE_AND_QUIT, tries=3):
1317  if self.tmpDir != '':
1318  for i in xrange(1, tries):
1319  try:
1320  time.sleep(delay)
1321  if os.path.isdir(self.tmpDir):
1322  if self.logger is not None:
1323  self.logger.debug("Removing tmp dir: %s, try: %s", self.tmpDir, str(i))
1324  shutil.rmtree(self.tmpDir)
1325  else:
1326  break
1327  except Exception, err:
1328  if self.logger is not None:
1329  self.logger.debug("Remove tmp dir: %s, try: %s, error: %s", self.tmpDir, str(i), str(err))
1330 
1331 

◆ should_have_meta_res()

def dc_crawler.Fetcher.SeleniumFetcher.should_have_meta_res (   self)

Definition at line 1041 of file Fetcher.py.

1041  def should_have_meta_res(self):
1042  return True
1043 
1044 

Member Data Documentation

◆ CHROME_DEBUG_LOG_NAME

string dc_crawler.Fetcher.SeleniumFetcher.CHROME_DEBUG_LOG_NAME = 'chrome_debug.log'
static

Definition at line 423 of file Fetcher.py.

◆ CHROME_DIRS_TEMPLATE

string dc_crawler.Fetcher.SeleniumFetcher.CHROME_DIRS_TEMPLATE = '.google.Chrome.'
static

Definition at line 422 of file Fetcher.py.

◆ CHROME_PROCESS_NAMES

list dc_crawler.Fetcher.SeleniumFetcher.CHROME_PROCESS_NAMES = ['chrome', 'BrowserBlocking']
static

Definition at line 421 of file Fetcher.py.

◆ CONTENT_TYPE_HTML

string dc_crawler.Fetcher.SeleniumFetcher.CONTENT_TYPE_HTML = 'text/html'
static

Definition at line 401 of file Fetcher.py.

◆ CONTENT_TYPE_JSON

string dc_crawler.Fetcher.SeleniumFetcher.CONTENT_TYPE_JSON = 'text/json'
static

Definition at line 400 of file Fetcher.py.

◆ DEFAUIL_TIMEOUT

int dc_crawler.Fetcher.SeleniumFetcher.DEFAUIL_TIMEOUT = 5
static

Definition at line 399 of file Fetcher.py.

◆ DELAY_TERMINATE_AND_QUIT

float dc_crawler.Fetcher.SeleniumFetcher.DELAY_TERMINATE_AND_QUIT = 0.5
static

Definition at line 402 of file Fetcher.py.

◆ driver

dc_crawler.Fetcher.SeleniumFetcher.driver

Definition at line 478 of file Fetcher.py.

◆ driverPid

dc_crawler.Fetcher.SeleniumFetcher.driverPid

Definition at line 479 of file Fetcher.py.

◆ ERROR_CONFLICT

int dc_crawler.Fetcher.SeleniumFetcher.ERROR_CONFLICT = 409
static

Definition at line 414 of file Fetcher.py.

◆ ERROR_CONNECTION_TIMED_OUT

int dc_crawler.Fetcher.SeleniumFetcher.ERROR_CONNECTION_TIMED_OUT = 505
static

Definition at line 411 of file Fetcher.py.

◆ ERROR_CONTENT_OR_COOKIE

int dc_crawler.Fetcher.SeleniumFetcher.ERROR_CONTENT_OR_COOKIE = 3
static

Definition at line 406 of file Fetcher.py.

◆ ERROR_EMPTY_RESPONSE

int dc_crawler.Fetcher.SeleniumFetcher.ERROR_EMPTY_RESPONSE = 13
static

Definition at line 415 of file Fetcher.py.

◆ ERROR_FATAL

int dc_crawler.Fetcher.SeleniumFetcher.ERROR_FATAL = 1
static

Definition at line 404 of file Fetcher.py.

◆ ERROR_GENERAL

int dc_crawler.Fetcher.SeleniumFetcher.ERROR_GENERAL = 2
static

Definition at line 405 of file Fetcher.py.

◆ ERROR_MACRO_RETURN_VALUE

int dc_crawler.Fetcher.SeleniumFetcher.ERROR_MACRO_RETURN_VALUE = 12
static

Definition at line 409 of file Fetcher.py.

◆ ERROR_NAME_NOT_RESOLVED

int dc_crawler.Fetcher.SeleniumFetcher.ERROR_NAME_NOT_RESOLVED = 400
static

Definition at line 407 of file Fetcher.py.

◆ ERROR_PROXY_CONNECTION_FAILED

int dc_crawler.Fetcher.SeleniumFetcher.ERROR_PROXY_CONNECTION_FAILED = 504
static

Definition at line 410 of file Fetcher.py.

◆ ERROR_SERVICE_UNAVAILABLE

int dc_crawler.Fetcher.SeleniumFetcher.ERROR_SERVICE_UNAVAILABLE = 503
static

Definition at line 413 of file Fetcher.py.

◆ ERROR_TOO_MANY_REDIRECTS

int dc_crawler.Fetcher.SeleniumFetcher.ERROR_TOO_MANY_REDIRECTS = 11
static

Definition at line 408 of file Fetcher.py.

◆ ERROR_TUNNEL_CONNECTION_FAILED

int dc_crawler.Fetcher.SeleniumFetcher.ERROR_TUNNEL_CONNECTION_FAILED = 403
static

Definition at line 412 of file Fetcher.py.

◆ inlineURLMacroDelimiter

dc_crawler.Fetcher.SeleniumFetcher.inlineURLMacroDelimiter

Definition at line 480 of file Fetcher.py.

◆ LOG_MESSAGE_RENDERRER_TIMEOUT

string dc_crawler.Fetcher.SeleniumFetcher.LOG_MESSAGE_RENDERRER_TIMEOUT = 'Timed out receiving message from renderer'
static

Definition at line 417 of file Fetcher.py.

◆ LOG_MESSAGE_SERVER_RESPONSE_409

string dc_crawler.Fetcher.SeleniumFetcher.LOG_MESSAGE_SERVER_RESPONSE_409 = 'server responded with a status of 409 (Conflict)'
static

Definition at line 419 of file Fetcher.py.

◆ LOG_MESSAGE_SERVER_RESPONSE_503

string dc_crawler.Fetcher.SeleniumFetcher.LOG_MESSAGE_SERVER_RESPONSE_503 = 'server responded with a status of 503'
static

Definition at line 418 of file Fetcher.py.

◆ logger

dc_crawler.Fetcher.SeleniumFetcher.logger

Definition at line 441 of file Fetcher.py.

◆ MACRO_RESULT_TYPE_AUTO

int dc_crawler.Fetcher.SeleniumFetcher.MACRO_RESULT_TYPE_AUTO = 3
static

Definition at line 428 of file Fetcher.py.

◆ MACRO_RESULT_TYPE_CONTENT

int dc_crawler.Fetcher.SeleniumFetcher.MACRO_RESULT_TYPE_CONTENT = 2
static

Definition at line 427 of file Fetcher.py.

◆ MACRO_RESULT_TYPE_DEFAULT

int dc_crawler.Fetcher.SeleniumFetcher.MACRO_RESULT_TYPE_DEFAULT = 0
static

Definition at line 425 of file Fetcher.py.

◆ MACRO_RESULT_TYPE_URLS_LIST

int dc_crawler.Fetcher.SeleniumFetcher.MACRO_RESULT_TYPE_URLS_LIST = 1
static

Definition at line 426 of file Fetcher.py.

◆ sessionId

dc_crawler.Fetcher.SeleniumFetcher.sessionId

Definition at line 481 of file Fetcher.py.

◆ TMP_DIR_TYPE_INSTANTIATE

int dc_crawler.Fetcher.SeleniumFetcher.TMP_DIR_TYPE_INSTANTIATE = 1
static

Definition at line 431 of file Fetcher.py.

◆ TMP_DIR_TYPE_OPEN

int dc_crawler.Fetcher.SeleniumFetcher.TMP_DIR_TYPE_OPEN = 0
static

Definition at line 430 of file Fetcher.py.

◆ tmpDir

dc_crawler.Fetcher.SeleniumFetcher.tmpDir

Definition at line 465 of file Fetcher.py.

◆ tmpDirPath

dc_crawler.Fetcher.SeleniumFetcher.tmpDirPath

Definition at line 446 of file Fetcher.py.

◆ tmpDirPrefix

dc_crawler.Fetcher.SeleniumFetcher.tmpDirPrefix

Definition at line 447 of file Fetcher.py.

◆ tmpDirRemoveBeforeCreate

dc_crawler.Fetcher.SeleniumFetcher.tmpDirRemoveBeforeCreate

Definition at line 450 of file Fetcher.py.

◆ tmpDirSuffix

dc_crawler.Fetcher.SeleniumFetcher.tmpDirSuffix

Definition at line 448 of file Fetcher.py.

◆ tmpDirType

dc_crawler.Fetcher.SeleniumFetcher.tmpDirType

Definition at line 449 of file Fetcher.py.

◆ userDataDirUsed

dc_crawler.Fetcher.SeleniumFetcher.userDataDirUsed

Definition at line 482 of file Fetcher.py.


The documentation for this class was generated from the following file:
Fetcher.py