"""
HCE project, Python bindings, Crawler application.

@file ftest_dc_fetchers.py
@author bgv <bgv.hce@gmail.com>
@link: http://hierarchical-cluster-engine.com/
@copyright: Copyright © 2015 IOIX Ukraine
@license: http://hierarchical-cluster-engine.com/license/
"""

# Shared 'hce' logger; DEBUG lets every record through at the logger level.
logger = logging.getLogger('hce')
logger.setLevel(logging.DEBUG)

# Stream handler (StreamHandler() with no argument writes to sys.stderr),
# also passing everything from DEBUG upwards.
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

# Record layout: timestamp, logger name, severity, message text.
formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

ch.setFormatter(formatter)
# NOTE(review): the visible lines never attach `ch` to `logger`
# (no `logger.addHandler(ch)`) - confirm whether that is intentional.
51 if __name__ ==
"__main__":
54 url =
'//192.168.253.114/template_scraping/index1.html' 65 base =
'https://regex101.com\a\\\\\\\\\\b/index2.html' 66 url =
'https://regex101.com\a\\\\\\\\\\b/34\\45/index.html' 68 supportProtocols = [
'http',
'https']
73 baseUrls = [
'http://www.developers.net/a/b/c/d/e/index.html',
74 'https://alexv:1234@developers.net/a/b/c/d/e/index.html',
75 'https://alexv:1234@developers.net:8080/a/b/c/d/e/index.html',
76 'http://www.developers.net/a/b/c/d/e/developers.net/a/b/c/d/e/index.html',
77 'https://regex101.com\a\\\\\\\\\\b/index2.html']
79 urls = [
'http://www.cwi.nl/%7Eguido/FAQ.html',
84 '//media.townhall.com/townhall/reu/ha/2017\33\\45\c70a08d0-cf34-45c3-b28d-5b685f4ed8bd.jpg',
85 '//media.townhall.com/townhall/reu/ha/2017/33/c70a08d0-cf34-45c3-b28d-5b685f4ed8bd.jpg',
86 '//media.townhall.com/townhall/reu/ha/2017%5C33%5Cc70a08d0-cf34-45c3-b28d-5b685f4ed8bd.jpg',
87 'https://regex101.com\a\\\\\\\\\\b/index.html']
94 baseUrls = [
'https://www.w3schools.com/images/']
95 urls = [
'://stickman.gif']
97 for baseUrl
in baseUrls:
def urlNormalization(base, url, supportProtocols=None, log=None)