2 HCE project, Python bindings, Distributed Tasks Manager application. 7 @author madk, bgv <developers.hce@gmail.com> 8 @link: http://hierarchical-cluster-engine.com/ 9 @copyright: Copyright © 2013-201 IOIX Ukraine 10 @license: http://hierarchical-cluster-engine.com/license/ 28 from random
import randint
29 from urlparse
import urlsplit
32 import requests.exceptions
33 from requests.auth
import HTTPBasicAuth
44 logger = logging.getLogger(APP_CONSTS.LOGGER_NAME)
47 MAX_CONTENT_SIZE_FOR_CHARDET = 5000000
60 CONNECTION_TIMEOUT = 1.0
71 def init(dbWrapper=None, siteId=None):
73 BaseFetcher.prohibited_conten_types = [
"audio/mpeg",
"application/pdf"]
75 BaseFetcher.fetchers = {
103 allowed_content_types=None,
104 max_resource_size=None,
105 max_redirects=CONSTS.MAX_HTTP_REDIRECTS_LIMIT,
107 executable_path=None,
112 del url, method, headers, timeout, allow_redirects, proxies, auth, data, log, allowed_content_types, \
113 max_resource_size, max_redirects, filters, executable_path, depth, macro
122 if not BaseFetcher.fetchers:
123 BaseFetcher.init(dbWrapper, siteId)
124 if typ
in BaseFetcher.fetchers:
125 return BaseFetcher.fetchers[typ]
127 raise BaseException(
"unsupported fetch type:%s" % (typ,))
145 urlParts = urlsplit(url)
146 if len(urlParts) > 1:
156 logger.debug(
'r.url = ' + str(r.url))
157 logger.debug(
'args = ' + str(args))
158 logger.debug(
'kwargs = ' + str(kwargs))
159 logger.debug(
'type(r): %s, r = %s', str(
type(r)),
varDump(r))
168 BaseFetcher.__init__(self)
190 allow_redirects=True,
195 allowed_content_types=None,
196 max_resource_size=None,
197 max_redirects=CONSTS.MAX_HTTP_REDIRECTS_LIMIT,
199 executable_path=None,
204 log = logger
if log
is None else log
207 for key
in headers.keys():
208 if not key.startswith(
'--'):
209 headers1[key] = headers[key]
212 if not isinstance(timeout, tuple):
213 if hasattr(self,
'connectionTimeout'):
219 auth = HTTPBasicAuth(auth[0], auth[1])
222 if proxies
is not None:
223 proxy_type, proxy_host, proxy_port, proxy_user, proxy_passwd = proxies
224 if proxy_type
is None:
226 if proxy_user
is not None:
227 proxies =
"%s://%s:%s@%s:%s" % (proxy_type, proxy_user, proxy_passwd, proxy_host, proxy_port)
229 proxies =
"%s://%s:%s" % (proxy_type, proxy_host, proxy_port)
230 proxy_setting = {
"http" : proxies}
237 impl_res = requestsRedirect.request(url=url,
241 allowRedirects=allow_redirects,
242 proxySetting=proxy_setting,
245 maxRedirects=max_redirects,
248 log.debug(
"!!! impl_res.headers: %s",
varDump(impl_res.headers))
249 log.debug(
"!!! impl_res.url: %s", str(impl_res.url))
251 location = impl_res.url
252 headers = dict(impl_res.headers.lower_items())
255 if "content-length" in impl_res.headers
and \
256 max_resource_size != CONSTS.MAX_HTTP_SIZE_UNLIMIT
and \
257 int(impl_res.headers[
'content-length']) > max_resource_size:
258 log.debug(
"Content size overshooted. content-length: %s, max_resource_size: %s" % \
259 (str(impl_res.headers[
'content-length']), str(max_resource_size)))
260 res.content_size = int(impl_res.headers[
'content-length'])
262 ct = impl_res.headers.get(
'content-type',
'').lower()
264 if ct.startswith(
'application')
or ct.startswith(
'audio')
or \
265 len(impl_res.content) >= MAX_CONTENT_SIZE_FOR_CHARDET:
268 log.debug(
"encoding3=%s", str(encoding))
269 if encoding
is not None:
270 impl_res.encoding = encoding
272 detected_encoding = impl_res.encoding
273 log.debug(
"Headers contains 'application' or 'audio' content-type: %s",
274 impl_res.headers.get(
'content-type',
''))
278 log.debug(
"impl_res.encoding1=%s, content-type=%s", impl_res.encoding, ct)
282 log.debug(
"Using the SimpleCharsetDetector()")
284 log.debug(
"encoding=%s", str(encoding))
285 if encoding
is not None:
286 impl_res.encoding = encoding
290 log.debug(
"encoding3=%s", str(encoding))
291 if encoding
is not None:
292 impl_res.encoding = encoding
295 if (impl_res.encoding
is None)
or ((encoding
is None)
and (impl_res.encoding
not in ct
and "xml" not in ct)):
296 log.debug(
"Using the charset to improve encoding detect")
297 detected_encoding = impl_res.apparent_encoding
298 if detected_encoding !=
'ascii' and detected_encoding !=
'ISO-8859-2':
299 impl_res.encoding = detected_encoding
300 log.debug(
"impl_res.encoding2=%s", impl_res.encoding)
303 if impl_res.headers.get(
'content-type',
'').startswith(
'application'):
304 res.unicode_content = impl_res.content
306 res.unicode_content = text_buffer
307 res.str_content = impl_res.content
308 if impl_res.headers.get(
'content-type',
'').startswith(
'application'):
309 res.rendered_unicode_content = impl_res.content
311 res.rendered_unicode_content = text_buffer
313 if res.rendered_unicode_content
is None:
316 res.content_size = len(res.rendered_unicode_content)
318 res.headers = impl_res.headers
319 res.redirects = impl_res.history
320 res.status_code = impl_res.status_code
321 res.url = impl_res.url
322 res.encoding = impl_res.encoding
323 res.request = impl_res.request
324 res.cookies = requests.utils.dict_from_cookiejar(impl_res.cookies)
327 res.headers.update({
'Location':location})
329 except (requests.exceptions.Timeout, requests.exceptions.ReadTimeout, requests.exceptions.ConnectTimeout), err:
330 res.error_mask = APP_CONSTS.ERROR_REQUEST_TIMEOUT
331 msg =
"Requests fetcher has thrown '%s' exception: " % str(
type(err))
332 if isinstance(err, requests.exceptions.Timeout):
333 msg +=
"The request timed out." 334 elif isinstance(err, requests.exceptions.ReadTimeout):
335 msg +=
"The server did not send any data in the allotted amount of time." 336 elif isinstance(err, requests.exceptions.ConnectTimeout):
337 msg +=
"The request timed out while trying to connect to the remote server." 341 except requests.exceptions.ConnectionError, err:
342 res.error_mask = APP_CONSTS.ERROR_FETCH_CONNECTION_ERROR
343 log.debug(
">>> Requests fetcher has thrown ConnectionError exception: " + str(err))
345 except requests.exceptions.HTTPError, err:
346 res.error_mask = APP_CONSTS.ERROR_FETCH_HTTP_ERROR
347 log.debug(
">>> Requests fetcher has thrown HTTPError exception: " + str(err))
349 except requests.exceptions.URLRequired, err:
350 res.error_mask = APP_CONSTS.ERROR_FETCH_INVALID_URL
351 log.debug(
">>> Requests fetcher has thrown URLRequired exception: " + str(err))
353 except requests.exceptions.TooManyRedirects, err:
354 res.error_mask = APP_CONSTS.ERROR_FETCH_TOO_MANY_REDIRECTS
355 log.debug(
">>> Requests fetcher has thrown TooManyRedirects exception: " + str(err))
357 except requests.exceptions.RequestException, err:
358 res.error_mask = APP_CONSTS.ERROR_FETCH_AMBIGUOUS_REQUEST
359 log.debug(
">>> Requests fetcher has thrown RequestException exception: " + str(err))
361 except CrawlerFilterException, err:
362 res.error_mask = APP_CONSTS.ERROR_CRAWLER_FILTERS_BREAK
363 log.debug(
"Crawler has not allowed filter: " + str(err))
365 except Exception, err:
366 res.error_mask = APP_CONSTS.ERROR_FETCHER_INTERNAL
367 log.debug(
">>> Requests fetcher has thrown exception" + \
368 " type: " + str(
type(err)) +
"\n" + Utils.getTracebackInfo())
382 if contentStr.startswith(
'<?xml ')
and '<html' in contentStr
and '<head' in contentStr:
384 p = re.compile(
r'<\?xml .*\?>')
385 contentStr = p.sub(
'', contentStr, count=1)
400 CONTENT_TYPE_JSON =
'text/json' 401 CONTENT_TYPE_HTML =
'text/html' 402 DELAY_TERMINATE_AND_QUIT = 0.5
406 ERROR_CONTENT_OR_COOKIE = 3
407 ERROR_NAME_NOT_RESOLVED = 400
408 ERROR_TOO_MANY_REDIRECTS = 11
409 ERROR_MACRO_RETURN_VALUE = 12
410 ERROR_PROXY_CONNECTION_FAILED = 504
411 ERROR_CONNECTION_TIMED_OUT = 505
412 ERROR_TUNNEL_CONNECTION_FAILED = 403
413 ERROR_SERVICE_UNAVAILABLE = 503
415 ERROR_EMPTY_RESPONSE = 13
417 LOG_MESSAGE_RENDERRER_TIMEOUT =
'Timed out receiving message from renderer' 418 LOG_MESSAGE_SERVER_RESPONSE_503 =
'server responded with a status of 503' 419 LOG_MESSAGE_SERVER_RESPONSE_409 =
'server responded with a status of 409 (Conflict)' 421 CHROME_PROCESS_NAMES = [
'chrome',
'BrowserBlocking']
422 CHROME_DIRS_TEMPLATE =
'.google.Chrome.' 423 CHROME_DEBUG_LOG_NAME =
'chrome_debug.log' 425 MACRO_RESULT_TYPE_DEFAULT = 0
426 MACRO_RESULT_TYPE_URLS_LIST = 1
427 MACRO_RESULT_TYPE_CONTENT = 2
428 MACRO_RESULT_TYPE_AUTO = 3
430 TMP_DIR_TYPE_OPEN = 0
431 TMP_DIR_TYPE_INSTANTIATE = 1
438 super(SeleniumFetcher, self).
__init__()
444 self.
logger.debug(
"Initialization of instance, tmpDirOptions: %s", str(tmpDirOptions))
451 if tmpDirOptions
is not None:
452 if 'path' in tmpDirOptions:
454 if 'prefix' in tmpDirOptions:
456 if 'suffix' in tmpDirOptions:
458 if 'type' in tmpDirOptions:
460 if 'remove_before_create' in tmpDirOptions:
462 pid = str(os.getpid()).strip()
471 msg =
'Temporary directory type INSTANTIATE `%s` initialization error!', self.
tmpDir 472 if self.
logger is not None:
476 if self.
logger is not None:
477 self.
logger.debug(
"Temporary directory type INSTANTIATE `%s` initialized!", self.
tmpDir)
489 self.
logger.debug(
"Delete instance, temporary dir type: %s", str(self.
tmpDirType))
515 timeout=DEFAUIL_TIMEOUT,
516 allow_redirects=True,
521 allowed_content_types=None,
522 max_resource_size=None,
525 executable_path=None,
532 if self.
logger is not None:
533 self.
logger.debug(
"Dynamic fetcher call:\nurl:" + str(url) + \
534 "\nmethod:" + str(method) +
"\nheaders:" + str(headers) +
"\ntimeout:" + str(timeout) + \
535 "\nallow_redirects:" + str(allow_redirects) +
"\nproxies:" + str(proxies) +
"\nauth:" + \
536 str(auth) +
"\ndata:" + str(data) +
"\nlogger:" + str(self.
logger) + \
537 "\nallowed_content_types:" + str(allowed_content_types) +
"\nmax_resource_size:" + \
538 str(max_resource_size) +
"\nmax_redirects:" + str(max_redirects) +
"\nexecutable_path:" + \
539 str(executable_path) +
"\ncur_dir:" + str(os.getcwd()) +
"\nmacro:" + str(macro))
542 if isinstance(timeout, tuple):
544 if isinstance(timeout[0], float):
545 t1 = int(str(timeout[0]).strip()[str(timeout[0]).strip().find(
'.') + 1:])
548 if isinstance(timeout, float):
549 t1 = int(str(timeout).strip()[str(timeout).strip().find(
'.') + 1:])
550 if self.
logger is not None:
551 self.
logger.debug(
"Execution timeout: %s, damping timeout: %s", str(t), str(t1))
553 msg =
"Execution timeout: %s less or equal than damping timeout: %s, aborted" % (str(t), str(t1))
554 if self.
logger is not None:
560 msg =
'Temporary directory type OPEN `%s` initialization error!' % self.
tmpDir 561 if self.
logger is not None:
565 if self.
logger is not None:
566 self.
logger.debug(
'Temporary directory type OPEN `%s` initialized', self.
tmpDir)
571 timeout=t, log=self.
logger)
573 if self.
logger is not None:
574 msg =
'Execution timeout: ' + str(t) +
' reached!' 577 except SeleniumFetcherException, err:
578 if self.
logger is not None:
579 self.
logger.
error(
"Error SeleniumFetcherException: %s", str(err))
582 except Exception, err:
583 if self.
logger is not None:
584 msg =
'Execution with timeout error:' + str(err)
591 if self.
logger is not None:
592 self.
logger.debug(
"Dynamic fetcher call finished normally.")
607 def openT(self, url, headers, timeout, proxies, executable_path, macro):
608 startTime = time.time()
619 from selenium
import webdriver
620 import selenium.webdriver.support.ui
621 except Exception, err:
622 msg =
'Selenium module import error: ' + str(err)
623 if self.
logger is not None:
627 if self.
logger is not None:
629 from selenium.webdriver.remote.remote_connection
import LOGGER
as seleniumLogger
630 seleniumLogger.setLevel(self.
logger.getEffectiveLevel())
632 selenium_logger = logging.getLogger(
'selenium.webdriver.remote.remote_connection')
635 selenium_logger.setLevel(self.
logger.getEffectiveLevel())
639 driver_name =
"chromedriver" 643 page_source_macro =
None 644 content_type_macro =
None 655 envVars = {
"DISPLAY":
"",
"LANG":
"en_US.UTF-8"}
656 for varName
in envVars:
657 v = os.getenv(varName,
"")
658 if varName ==
"DISPLAY":
662 if v != envVars[varName]:
664 envVars[varName] +
"', got from os: '" + v +
"'; all env: " + \
670 chrome_option = self.
getOptions(webdriver, headers, proxies, url)
673 if executable_path
is None:
674 path = exec_path + driver_name + str(ctypes.sizeof(ctypes.c_voidp) * 8)
676 path = executable_path
677 if self.
logger is not None:
678 self.
logger.debug(
"Chrome driver executable path: %s, options: %s", str(path), str(chrome_option.arguments))
679 from selenium.webdriver.common.desired_capabilities
import DesiredCapabilities
681 d = DesiredCapabilities.CHROME
682 d[
'loggingPrefs'] = {
'browser':
'ALL'}
684 self.
driver = webdriver.Chrome(executable_path=path, chrome_options=chrome_option, desired_capabilities=d)
688 except Exception, err:
689 error_msg =
'Driver initialization error: ' + str(err)
692 error_msg =
'General driver initialization!' 696 if self.
logger is not None:
706 if self.
logger is not None:
707 self.
logger.debug(
"Chrome driver get url: `%s`", str(url))
710 log_types = self.
driver.log_types
711 if 'browser' in log_types:
712 log_list = self.
driver.get_log(
'browser')
713 if self.
logger is not None:
714 self.
logger.debug(
"Driver logs: " + str(log_list))
715 for item_dict
in log_list:
716 if self.
logger is not None:
717 self.
logger.debug(
"Driver message: `%s`", str(item_dict[
"message"]))
718 if "message" in item_dict
and ((url +
' ')
in item_dict[
"message"]
or (url +
'/ ')
in item_dict[
"message"]):
719 error_msg += item_dict[
"message"] +
" | " 721 if self.
logger is not None:
736 (
r"(.*)403 \(Forbidden\)(.*)", 403),
737 (
r"(.*)404 \(Not Found\)(.*)", 404),
738 (
r"(.*)500 \(Internal Server Error\)(.*)", 500),
739 (
r"(.*)net::(.*)", 520)]
740 for item
in entrances:
741 regex = re.compile(item[0])
742 r = regex.search(error_msg)
745 if self.
logger is not None:
746 self.
logger.debug(
"Page error: " + error_msg)
748 if error_code
not in fatalErrors
and inlineMacro !=
'':
749 if self.
logger is not None:
750 self.
logger.debug(
"Execute inline macro: %s", str(inlineMacro))
751 macroResults, errorCode, errorMsg = self.
execMacroSimple([inlineMacro])
752 if error_code
not in fatalErrors
and macro
is not None:
753 if self.
logger is not None:
754 self.
logger.debug(
"Execute macro: %s", str(macro))
755 if isinstance(macro, list):
758 macroResults, errorCode, errorMsg, content_type_macro, result_type_macro = self.
execMacroExtended(macro)
760 error_code_macro |= APP_CONSTS.ERROR_MACRO
762 if len(macroResults) > 0:
764 page_source_macro = macroResults
766 page_source_macro = json.dumps(macroResults, ensure_ascii=
False)
767 except Exception, err:
771 error_msg =
"General driver usage error!" 776 if self.
logger is not None:
777 self.
logger.debug(
"Wait on damping timeout to load all dynamic parts of the page: %s sec", str(timeout))
780 elif error_code
in fatalErrors:
781 if self.
logger is not None:
782 self.
logger.debug(
"Fatal error, code: %s, msg: %s", str(error_code), error_msg)
784 code = APP_CONSTS.ERROR_FETCH_INVALID_URL
786 code = APP_CONSTS.ERROR_FETCH_TOO_MANY_REDIRECTS
788 code = APP_CONSTS.ERROR_FETCH_CONNECTION_ERROR
790 code = APP_CONSTS.ERROR_FETCH_CONNECTION_TIMEOUT
792 code = APP_CONSTS.ERROR_FETCH_FORBIDDEN
794 code = APP_CONSTS.ERROR_EMPTY_RESPONSE
796 code = APP_CONSTS.ERROR_FETCH_FORBIDDEN
798 code = APP_CONSTS.ERROR_FETCH_HTTP_ERROR
800 code = APP_CONSTS.ERROR_FETCHER_INTERNAL
807 page_source = self.
driver.page_source
808 cookies = self.
driver.get_cookies()
809 except Exception, err:
813 error_msg =
"Content and cookies get error!" 819 attr = self.
driver.find_element_by_xpath(
".//meta[translate(@http-equiv,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')='content-type']").get_attribute(
"content")
820 regex = re.compile(
r"(.*); charset=(.*)", re.IGNORECASE)
821 items = regex.search(attr)
822 if items
is not None:
823 items = items.groups()
825 content_type = items[0]
827 except Exception, err:
829 if content_type
is None:
831 attr = self.
driver.find_element_by_xpath(
'//html')
833 except Exception, err:
835 if content_type
is not None and charset
is None:
837 charset = self.
driver.find_element_by_xpath(
'//meta[@charset]').get_attribute(
"charset")
838 except Exception, err:
842 charset = self.
driver.execute_script(
"return document.characterSet;")
843 except Exception, err:
844 if self.
logger is not None:
845 self.
logger.debug(
"Charset detection error: %s", str(err))
848 current_url = self.
driver.current_url
849 except Exception, err:
851 if self.
logger is not None:
852 self.
logger.debug(
"Get 'current_url' error: %s, input url assumed: %s", str(err), str(url))
860 res.url = current_url
861 if error_code > 100
or error_code == self.
ERROR_FATAL:
862 res.status_code = error_code
864 res.status_code = 200
866 if page_source_macro
is None:
867 res.unicode_content = page_source
869 res.unicode_content = page_source_macro
870 res.str_content = res.unicode_content
871 res.rendered_unicode_content = res.unicode_content
872 res.content_size = len(res.unicode_content)
873 res.encoding = charset
874 res.headers = {
'content-length': res.content_size}
875 if page_source_macro
is not None:
876 if content_type_macro
is not None:
877 content_type = content_type_macro
880 if content_type
is not None:
881 res.headers[
'content-type'] = content_type
882 if current_url != url:
883 res.headers[
'location'] = current_url
884 res.meta_res = res.unicode_content
885 res.cookies = cookies
886 res.dynamic_fetcher_type = driver_name
887 res.dynamic_fetcher_result_type = result_type_macro
888 if error_code_macro != APP_CONSTS.ERROR_OK:
889 res.error_mask |= error_code_macro
890 res.time = time.time() - startTime
891 res.request = {
'headers':headers}
892 res.error_msg = error_msg
893 except Exception, err:
894 msg =
'Response fill error: ' + str(err)
895 if self.
logger is not None:
899 if self.
logger is not None and error_msg !=
"":
900 self.
logger.debug(
"Dynamic fetcher none fatal error: " + error_msg)
904 except Exception, err:
905 msg =
'Unrecognized dynamic fetcher error: ' + str(err)
906 if self.
logger is not None:
916 if self.
logger is not None and '--log-chrome-debug-log' in headers:
919 with
open(logFile,
'r') as f: 921 self.logger.debug("Chrome debug log file `%s`:\n%s", logFile, logData)
922 except Exception, err:
923 self.
logger.debug(
"Error read chrome debug log file `%s`: %s", logFile, str(err))
925 if self.
logger is not None:
926 self.
logger.debug(
"Cleanup type: %s, driver: %s", str(state), str(self.
driver))
929 if self.
driver is not None:
938 if self.
driver is not None:
962 self.
logger.debug(
"Chrome processes cleanup started")
968 if '--disk-cache-dir' in headers:
969 key =
'--disk-cache-dir=' + headers[
'--disk-cache-dir']
970 if '--profile-directory' in headers:
971 key =
'--profile-directory=' + headers[
'--profile-directory']
972 if '--user-data-dir' in headers:
973 key =
'--user-data-dir=' + headers[
'--user-data-dir']
978 for proc
in psutil.process_iter():
984 if name
in proc.name():
986 for item
in proc.cmdline():
992 self.
logger.debug(
"Chrome process killing, pid:%s, cmdline: %s", str(proc.pid), str(proc.cmdline()))
994 except Exception, err:
996 self.
logger.debug(
"Chrome process kill error: %s", str(err))
997 except Exception, err:
999 self.
logger.debug(
"Chrome process kill error: %s", str(err))
1002 def killProcess(self, pid, dirsTemplate=CHROME_DIRS_TEMPLATE, dirDeleteBeforeTimeout=DELAY_TERMINATE_AND_QUIT):
1003 del dirsTemplate, dirDeleteBeforeTimeout
1006 self.
logger.debug(
"Try to Kill process pid: %s", str(pid))
1007 process = psutil.Process(pid)
1008 for proc
in process.children(recursive=
True):
1010 self.
logger.debug(
"Killing child process pid: %s", str(proc.pid))
1020 except Exception, err:
1022 self.
logger.debug(
"Child process pid: %s kill error: ", str(pid), str(err))
1024 self.
logger.debug(
"Killing main process pid: %s", str(process.pid))
1033 except Exception, err:
1035 self.
logger.debug(
"Process pid: %s kill error: %s", str(pid), str(err))
1053 for f
in process.open_files():
1056 fp = f.path.split(
'/')
1058 templateFound =
False 1061 if dirsTemplate
is not None and dirsTemplate !=
'' and dirsTemplate
in item:
1062 templateFound =
True 1064 if templateFound
or dirsTemplate
is None or dirsTemplate ==
'':
1077 chrome_option = webdriver.ChromeOptions()
1079 arg_disable_http_cache =
"--disable-http-cache" 1080 arg_clear_data_reduction_proxy_data_savings =
'--clear-data-reduction-proxy-data-savings' 1081 arg_host_resolver_retry_attempts =
'--host-resolver-retry-attempts=0' 1082 arg_start_maximized =
'--start-maximized' 1083 if headers
is not None and '--use-mobile-user-agent' in headers:
1084 use_mobile_user_agent =
'--use-mobile-user-agent' 1086 use_mobile_user_agent =
None 1088 if headers
is not None and '--disable-web-security' in headers:
1089 disable_web_security =
'--disable-web-security' 1091 disable_web_security =
None 1094 if headers
is not None and '--allow-running-insecure-content' in headers:
1095 allow_running_insecure_content =
'--allow-running-insecure-content' 1097 allow_running_insecure_content =
None 1100 if headers
is not None and '--allow-file-access-from-files' in headers:
1101 allow_file_access_from_files =
'--allow-file-access-from-files' 1103 allow_file_access_from_files =
None 1106 if headers
is not None and '--proxy-bypass-list' in headers:
1107 arg_proxy_bypass_list =
'--proxy-bypass-list=' + headers[
'--proxy-bypass-list']
1109 arg_proxy_bypass_list =
None 1112 if headers
is not None and 'User-Agent' in headers
and\
1113 '--use-mobile-user-agent' not in headers:
1114 arg_user_agent =
'--user-agent=' + headers[
'User-Agent']
1116 arg_user_agent =
None 1117 if '--disk-cache-dir' in headers:
1118 if os.path.isdir(headers[
'--disk-cache-dir']):
1119 arg_disk_cache_dir =
'--disk-cache-dir=' + headers[
'--disk-cache-dir']
1122 self.
logger.debug(
"Header `--disk-cache-dir` directory: `%s` not found!", headers[
'--disk-cache-dir'])
1124 arg_disk_cache_dir =
None 1125 if '--profile-directory' in headers:
1126 if os.path.isdir(headers[
'--profile-directory']):
1127 arg_profile_directory =
'--profile-directory=' + headers[
'--profile-directory']
1130 self.
logger.debug(
"Header `--profile-directory` directory: `%s` not found!", headers[
'--profile-directory'])
1132 arg_profile_directory =
None 1133 if '--user-data-dir' in headers:
1134 if os.path.isdir(headers[
'--user-data-dir']):
1135 arg_user_data_dir =
'--user-data-dir=' + headers[
'--user-data-dir']
1139 self.
logger.debug(
"Header `--user-data-dir` directory: `%s` not found!", headers[
'--user-data-dir'])
1142 arg_user_data_dir =
'--user-data-dir=' + self.
tmpDir 1145 arg_user_data_dir =
None 1151 self.
logger.debug(
"Profile archive user data dir `%s` not found, trying to create...",
1155 except Exception, err:
1157 self.
logger.debug(
"Profile archive user data dir creation error: %s", str(err))
1164 profiles = [p.strip()
for p
in headers[
'--user-data-dir-zip'].split(
',')
if p.strip() !=
'']
1165 if '--user-data-dir-zip-rotation' in headers
and headers[
'--user-data-dir-zip-rotation']
is not None and\
1166 headers[
'--user-data-dir-zip-rotation'] !=
'':
1167 rotationType = int(headers[
'--user-data-dir-zip'])
1171 if len(profiles) > 1:
1172 if rotationType == 0:
1173 r = [randint(0, len(profiles) - 1)
for p
in range(0, len(profiles) - 1)]
1175 elif rotationType == 1:
1177 elif rotationType == 2:
1184 res = Utils.executeCommand(
'unzip -qq ' + profiles[profileIndex] +
' -d ' + self.
userDataDirUsed)
1185 if res.exitCode != APP_CONSTS.EXIT_SUCCESS:
1186 raise Exception(str(res.stderr))
1189 os.path.splitext(os.path.basename(profiles[profileIndex]))[0] + \
1191 if res.exitCode != APP_CONSTS.EXIT_SUCCESS:
1192 raise Exception(str(res.stderr))
1195 self.
logger.debug(
"Profile archive `%s` extracted to `%s` directory, rotation: %s",
1197 except Exception, err:
1200 raise Exception(
"Profile archive extraction error: %s" % str(err))
1203 d = {
'--user-data-dir-zip in headers':str(
'--user-data-dir-zip' in headers),
1205 'os.path.isdir(self.userDataDirUsed)':str(os.path.isdir(self.
userDataDirUsed))}
1207 self.
logger.debug(
"Profile archive not used, condition data:\n%s", str(d))
1209 arg_dns_prefetch_disable =
'--dns-prefetch-disable' 1214 if headers
is not None and '--disable-setuid-sandbox' in headers:
1215 chrome_option.add_argument(
'--disable-setuid-sandbox')
1217 if headers
is not None and '--no-sandbox' in headers:
1218 chrome_option.add_argument(
'--no-sandbox')
1220 if headers
is not None and '--incognito' in headers:
1221 chrome_option.add_argument(
'--incognito')
1227 if arg_user_agent
is not None:
1228 chrome_option.add_argument(arg_user_agent)
1229 if use_mobile_user_agent
is not None:
1230 chrome_option.add_argument(use_mobile_user_agent)
1231 if disable_web_security
is not None:
1232 chrome_option.add_argument(disable_web_security)
1233 if allow_running_insecure_content
is not None:
1234 chrome_option.add_argument(allow_running_insecure_content)
1235 if allow_file_access_from_files
is not None:
1236 chrome_option.add_argument(allow_file_access_from_files)
1237 if arg_proxy_bypass_list
is not None:
1238 chrome_option.add_argument(arg_proxy_bypass_list)
1239 chrome_option.add_argument(arg_disable_http_cache)
1240 chrome_option.add_argument(arg_clear_data_reduction_proxy_data_savings)
1241 chrome_option.add_argument(arg_host_resolver_retry_attempts)
1242 chrome_option.add_argument(arg_start_maximized)
1243 if arg_disk_cache_dir
is not None and arg_disk_cache_dir !=
'':
1244 chrome_option.add_argument(arg_disk_cache_dir)
1245 if arg_profile_directory
is not None and arg_profile_directory !=
'':
1246 chrome_option.add_argument(arg_profile_directory)
1247 if arg_user_data_dir
is not None and arg_user_data_dir !=
'':
1248 chrome_option.add_argument(arg_user_data_dir)
1249 chrome_option.add_argument(arg_dns_prefetch_disable)
1253 chrome_option.add_argument(self.
sessionId)
1256 if proxies
is not None:
1257 proxy_type, proxy_host, proxy_port, proxy_user, proxy_passwd = proxies
1259 self.
logger.debug(
"Proxy used from argument tuple: %s", str(proxies))
1261 proxies = proxy_type +
"://%s:%s@%s:%s" % (proxy_user, proxy_passwd, proxy_host, proxy_port)
1263 proxies = proxy_type +
"://%s:%s" % (proxy_host, proxy_port)
1264 chrome_option.add_argument(
"--proxy-server=" + proxies)
1266 if '--proxy-http' in headers
and headers[
'--proxy-http']
is not None and headers[
'--proxy-http'] !=
'':
1267 if '--proxy-http-domains' in headers
and headers[
'--proxy-http-domains']
is not None and\
1268 headers[
'--proxy-http-domains'] !=
'':
1270 domain = bool(dn
in headers[
'--proxy-http-domains'].split(
','))
1271 if self.
logger and domain
is False:
1272 self.
logger.debug(
"Proxy not used because domain `%s` not listed in `--proxy-http-domains`", str(dn))
1276 p = headers[
'--proxy-http'].replace(
'%3A',
':')
1278 self.
logger.debug(
"Proxy used from header: %s", str(p))
1279 chrome_option.add_argument(
"--proxy-server=" + p)
1281 return chrome_option
1294 if headers
is not None and 'tmp-dir' in headers:
1295 self.
tmpDir = headers[
'tmp-dir']
1298 if not os.path.isdir(self.
tmpDir):
1299 if logger
is not None:
1300 self.
logger.debug(
"Create temporary directory: %s", str(self.
tmpDir))
1302 except Exception, err:
1303 if self.
logger is not None:
1305 if logger
is not None:
1306 self.
logger.debug(
"Error temporary directories initialization: %s", str(err))
1308 if os.path.isdir(self.
tmpDir):
1318 for i
in xrange(1, tries):
1321 if os.path.isdir(self.
tmpDir):
1322 if self.
logger is not None:
1323 self.
logger.debug(
"Removing tmp dir: %s, try: %s", self.
tmpDir, str(i))
1324 shutil.rmtree(self.
tmpDir)
1327 except Exception, err:
1328 if self.
logger is not None:
1329 self.
logger.debug(
"Remove tmp dir: %s, try: %s, error: %s", self.
tmpDir, str(i), str(err))
1344 if self.
logger is not None:
1345 self.
logger.debug(
"Macro #%s in set of %s items:\n%s...",
1346 str(macroCounter), str(len(macro)), str(m)[:maxLenToLog])
1349 if self.
logger is not None:
1350 self.
logger.debug(
"Macro sleep: %s sec", str(m))
1356 if m.startswith(
'!'):
1359 params = m.split(
':')
1361 iDelay = int(params[0])
1362 iMaxIterations = int(params[1])
1364 elif len(params) > 1:
1365 iDelay = int(params[0])
1367 elif len(params) == 1:
1369 if self.
logger is not None:
1370 self.
logger.debug(
"Macro blocking iterative, delay: %s, max ierations: %s",
1371 str(iDelay), str(iMaxIterations))
1372 for i
in xrange(0, iMaxIterations):
1374 if self.
logger is not None:
1375 self.
logger.debug(
"Macro blocking iteration: %s of: %s", str(i + 1), str(iMaxIterations))
1376 if m.startswith(
'http://')
or m.startswith(
'https://')
or m.startswith(
'file://'):
1378 if m.startswith(
'file://'):
1379 with
open(m[7:].replace(
'%PID%', str(os.getpid())),
'r') as f: 1382 r = requests.get(m.replace(
'%PID%', str(os.getpid())))
1384 if self.
logger is not None:
1385 self.
logger.debug(
"Macro %s bytes loaded:\n%s...", str(len(str(m))), str(m)[:maxLenToLog])
1386 except Exception, err:
1387 error_msg =
'Error load macro code, URL: `' + str(m) +
'` : ' + str(err)
1389 if self.
logger is not None:
1390 self.
logger.debug(error_msg)
1394 r = self.
driver.execute_script(m)
1395 if self.
logger is not None:
1396 self.
logger.debug(
"Macro returned: %s", json.dumps(r))
1397 except Exception, err:
1398 error_msg =
'Error macro execution: ' + str(err) +
'; logs: ' + self.
getAllLogsAsString()
1400 if self.
logger is not None:
1401 self.
logger.debug(error_msg)
1404 if iType == 0
and r
is not None:
1405 if isinstance(r, (basestring, list, dict)):
1406 macroResults.append(r)
1407 if isinstance(r, (list, dict)):
1408 if self.
logger is not None:
1409 self.
logger.debug(
"Macro items returned: %s", str(len(r)))
1411 error_msg =
'Error macro result value, type is: ' + str(
type(r))
1413 if self.
logger is not None:
1414 self.
logger.debug(error_msg)
1418 if self.
logger is not None:
1419 self.
logger.debug(
"Macro blocking got `True` on iteration: %s, sleeped: %s sec",
1420 str(i + 1), str(int(iDelay) * i))
1422 elif r
is not True and iDelay > 0:
1423 if self.
logger is not None:
1424 self.
logger.debug(
"Macro blocking iteration: %s sleep on: %s sec", str(i + 1), str(iDelay))
1425 time.sleep(int(iDelay))
1426 if iType == 1
and r
is not True:
1427 if self.
logger is not None:
1428 self.
logger.debug(
"Macro blocking finished, but no `True` value returned!")
1432 return macroResults, error_code, error_msg
1439 return 'browser: ' + str(self.
driver.get_log(
'browser')) +
'; driver: ' + str(self.
driver.get_log(
'driver'))
1453 for mset
in macro[
'sets']:
1454 if 'name' not in mset:
1456 if 'repeat' not in mset:
1457 mset[
'repeat'] =
'1' 1458 if 'delay' not in mset:
1460 if self.
logger is not None:
1461 self.
logger.debug(
"Set:\n%s", str(mset))
1462 for i
in xrange(0, int(mset[
'repeat'])):
1463 if int(mset[
'delay']) > 0:
1464 time.sleep(int(mset[
'delay']))
1465 if self.
logger is not None:
1466 self.
logger.debug(
"Macro %s in set", str(i))
1474 if 'result_type' in macro:
1475 result_type = int(macro[
'result_type'])
1476 self.
logger.debug(
"Macro results type: %s", str(result_type))
1477 if 'result_content_type' in macro:
1478 content_type = str(macro[
'result_content_type'])
1479 self.
logger.debug(
"Macro results content type: %s", str(content_type))
1482 self.
logger.debug(
"Macro results before autodetect type: %s", str(macroResults))
1483 for r
in macroResults:
1484 if isinstance(r, basestring):
1486 self.
logger.debug(
"Macro results type autodetected as string content")
1488 elif isinstance(r, list):
1490 if isinstance(ri, basestring):
1492 self.
logger.debug(
"Macro results type autodetected as URLs list")
1495 macroResults =
''.
join(macroResults)
1497 macroResults = [item
for sublist
in macroResults
for item
in sublist]
1498 self.
logger.debug(
"Macro results after autodetect type: %s", str(macroResults))
1500 return macroResults, error_code, error_msg, content_type, result_type
1526 if 'logger' in kwargs[
'logger']:
1527 log = kwargs[
'logger']
1530 allowed_content_types = kwargs[
'allowed_content_types']
1534 log.debug(
"url: <%s>", url)
1537 response = urllib2.urlopen(url)
1538 headers_info = response.info()
1539 if headers_info
is not None:
1540 if headers_info.type
in allowed_content_types:
1541 if response
is not None:
1544 res.url = response.geturl()
1545 res.status_code = response.getcode()
1546 content_response = response.read()
1547 res.unicode_content = content_response
1548 res.str_content = content_response
1549 res.rendered_unicode_content = content_response
1550 res.content_size = len(content_response)
1552 headers[
"content-length"] = res.content_size
1553 headers[
"content-type"] = headers_info.type
1554 res.headers = headers
1556 res.redirects = history
1558 log.debug(
"URLLib return empty response.")
1560 log.debug(
"Content-Type not allowed. headers_info.type: %s", str(headers_info.type))
1562 log.debug(
"URLLib info is empty.")
1563 except urllib2.HTTPError, err:
1565 log.debug(
"Exception <%s>", str(err.code))
1588 localBuf = base64.b64decode(kwargs[
"inputContent"])
1590 localBuf = kwargs[
"inputContent"]
1592 res.content_size = len(localBuf)
1595 res.status_code = 200
1598 if res.encoding
is None:
1599 res.encoding =
"utf-8" 1600 res.unicode_content = localBuf
1601 res.str_content = localBuf
1602 res.rendered_unicode_content = localBuf
1657 def detect(self, content=None, contentType="html"):
1666 if contentType ==
'html':
1667 pattern =
r'<meta(?!\s*(?:name|value)\s*=)(?:[^>]*?content\s*=[\s"\']*)?([^>]*?)[\s"\';]*charset\s*=[\s"\']*([^\s"\'/>]*)' 1668 matchObj = re.search(pattern, cnt, re.I | re.M | re.S)
1670 ret = matchObj.group(2)
1671 elif contentType ==
'xml':
1674 except Exception, err:
1675 logger.error(
"Exception: %s", str(err))
1677 if ret
is not None and ret
in CONSTS.charsetDetectorMap:
1678 logger.debug(
"Extracted wrong encoding '%s' from page replace to correct '%s'", ret,
1679 CONSTS.charsetDetectorMap[ret])
1680 ret = CONSTS.charsetDetectorMap[ret]
1686 """ Attempts to detect the character encoding of the xml file 1687 given by a file object fp. fp must not be a codec wrapped file 1690 The return value can be: 1691 - if detection of the BOM succeeds, the codec name of the 1692 corresponding unicode charset is returned 1694 - if BOM detection fails, the xml declaration is searched for 1695 the encoding attribute and its value returned. the "<" 1696 character has to be the very first in the file then (it's xml 1697 standard after all). 1699 - if BOM and xml declaration fail, None is returned. According 1700 to xml 1.0 it should be utf_8 then, but it wasn't detected by 1701 the means offered here. at least one can be pretty sure that a 1702 character coding including most of ASCII is used :-/ 1708 (0x00, 0x00, 0xFE, 0xFF) :
"utf_32_be",
1709 (0xFF, 0xFE, 0x00, 0x00) :
"utf_32_le",
1710 (0xFE, 0xFF,
None,
None) :
"utf_16_be",
1711 (0xFF, 0xFE,
None,
None) :
"utf_16_le",
1712 (0xEF, 0xBB, 0xBF,
None) :
"utf_8",
1719 (byte1, byte2, byte3, byte4) = tuple(map(ord, fp.read(4)))
1722 bomDetection = bomDict.get((byte1, byte2, byte3, byte4))
1723 if not bomDetection :
1724 bomDetection = bomDict.get((byte1, byte2, byte3,
None))
1725 if not bomDetection :
1726 bomDetection = bomDict.get((byte1, byte2,
None,
None))
1740 buff = fp.read(2048)
1743 xmlDeclPattern =
r""" 1744 ^<\?xml # w/o BOM, xmldecl starts with <?xml at the first byte 1745 .+? # some chars (version info), matched minimal 1746 encoding= # encoding attribute begins 1747 ["'] # attribute start delimiter 1748 (?P<encstr> # what's matched in the brackets will be named encstr 1749 [^"']+ # every character not delimiter (not overly exact!) 1750 ) # closes the brackets pair for the named group 1751 ["'] # attribute end delimiter 1752 .*? # some chars optionally (standalone decl or whitespace) 1756 xmlDeclRE = re.compile(xmlDeclPattern, re.VERBOSE)
1759 match = xmlDeclRE.search(buff)
1763 return match.group(
"encstr")
list CHROME_PROCESS_NAMES
def initializeTmpDirs(self, headers)
string LOG_MESSAGE_SERVER_RESPONSE_409
def should_have_meta_res(self)
def getProcessDirs(self, process, dirsTemplate)
def chromeProcessesCleanup(self, headers)
string LOG_MESSAGE_RENDERRER_TIMEOUT
def __init__(self, dbWrapper=None, siteId=None)
def __init__(self, tmpDirOptions=None, log=None)
def init(dbWrapper=None, siteId=None)
int MACRO_RESULT_TYPE_AUTO
def execMacroExtended(self, macro)
dynamic_fetcher_result_type
def openT(self, url, headers, timeout, proxies, executable_path, macro)
def open(self, url, kwargs)
def getOptions(self, webdriver, headers, proxies, url)
int MACRO_RESULT_TYPE_CONTENT
int ERROR_CONNECTION_TIMED_OUT
string CHROME_DEBUG_LOG_NAME
def killProcess(self, pid, dirsTemplate=CHROME_DIRS_TEMPLATE, dirDeleteBeforeTimeout=DELAY_TERMINATE_AND_QUIT)
int ERROR_NAME_NOT_RESOLVED
int MACRO_RESULT_TYPE_URLS_LIST
def cleanup(self, state=0, headers=None)
def executeWithTimeout(func, args=None, kwargs=None, timeout=1, default=None, log=None)
int ERROR_PROXY_CONNECTION_FAILED
string CHROME_DIRS_TEMPLATE
def should_have_meta_res(self)
def detect(self, content=None, contentType="html")
def fixWrongXMLHeader(self, contentStr)
def open(self, url, method='get', headers=None, timeout=DEFAUIL_TIMEOUT, allow_redirects=True, proxies=None, auth=None, data=None, log=None, allowed_content_types=None, max_resource_size=None, max_redirects=1, filters=None, executable_path=None, depth=None, macro=None)
int ERROR_MACRO_RETURN_VALUE
def open(self, url, method='get', headers=None, timeout=100, allow_redirects=True, proxies=None, auth=None, data=None, log=None, allowed_content_types=None, max_resource_size=None, max_redirects=CONSTS.MAX_HTTP_REDIRECTS_LIMIT, filters=None, executable_path=None, depth=None, macro=None)
def execMacroSimple(self, macro)
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
string LOG_MESSAGE_SERVER_RESPONSE_503
def get_fetcher(typ, dbWrapper=None, siteId=None)
int ERROR_TOO_MANY_REDIRECTS
int MACRO_RESULT_TYPE_DEFAULT
def xmlCharsetDetector(self, fp, buff=None)
def removeTmpDirs(self, delay=DELAY_TERMINATE_AND_QUIT, tries=3)
def __init__(self, content=None)
def checkRedirectsHook(r, args, kwargs)
float DELAY_TERMINATE_AND_QUIT
int ERROR_SERVICE_UNAVAILABLE
def getDomainNameFromURL(self, url, default='')
def open(self, url, method='get', headers=None, timeout=100, allow_redirects=True, proxies=None, auth=None, data=None, log=None, allowed_content_types=None, max_resource_size=None, max_redirects=CONSTS.MAX_HTTP_REDIRECTS_LIMIT, filters=None, executable_path=None, depth=None, macro=None)
def getAllLogsAsString(self)
int TMP_DIR_TYPE_INSTANTIATE
int ERROR_CONTENT_OR_COOKIE
def open(self, url, kwargs)
int ERROR_TUNNEL_CONNECTION_FAILED