HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_crawler.Fetcher.BaseFetcher Class Reference
Inheritance diagram for dc_crawler.Fetcher.BaseFetcher:
Collaboration diagram for dc_crawler.Fetcher.BaseFetcher:

Public Member Functions

def __init__ (self)
 
def open (self, url, method='get', headers=None, timeout=100, allow_redirects=True, proxies=None, auth=None, data=None, log=None, allowed_content_types=None, max_resource_size=None, max_redirects=CONSTS.MAX_HTTP_REDIRECTS_LIMIT, filters=None, executable_path=None, depth=None, macro=None)
 
def should_have_meta_res (self)
 
def getDomainNameFromURL (self, url, default='')
 

Static Public Member Functions

def init (dbWrapper=None, siteId=None)
 
def get_fetcher (typ, dbWrapper=None, siteId=None)
 

Public Attributes

 connectionTimeout
 
 logger
 

Static Public Attributes

 fetchers = None
 
int TYP_NORMAL = 1
 
int TYP_DYNAMIC = 2
 
int TYP_URLLIB = 5
 
int TYP_CONTENT = 6
 
int TYP_AUTO = 7
 
float CONNECTION_TIMEOUT = 1.0
 

Detailed Description

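BaseFetcher keeps a class-level registry, fetchers, that maps the TYP_* constants to fetcher instances. init() builds the registry with entries for TYP_NORMAL, TYP_DYNAMIC, TYP_URLLIB and TYP_CONTENT; get_fetcher() calls init() on first use and returns the instance registered for the requested type, raising BaseException for any type that is not registered (TYP_AUTO is not registered by init()).

Definition at line 50 of file Fetcher.py.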

Constructor & Destructor Documentation

◆ __init__()

def dc_crawler.Fetcher.BaseFetcher.__init__ (   self)

Definition at line 65 of file Fetcher.py.

65  def __init__(self):
66  self.connectionTimeout = self.CONNECTION_TIMEOUT
67  self.logger = None
68 
69 
def __init__(self)
constructor
Definition: UIDGenerator.py:19

Member Function Documentation

◆ get_fetcher()

def dc_crawler.Fetcher.BaseFetcher.get_fetcher (   typ,
  dbWrapper = None,
  siteId = None 
)
static

Definition at line 121 of file Fetcher.py.

121  def get_fetcher(typ, dbWrapper=None, siteId=None):
122  if not BaseFetcher.fetchers:
123  BaseFetcher.init(dbWrapper, siteId)
124  if typ in BaseFetcher.fetchers:
125  return BaseFetcher.fetchers[typ]
126  else:
127  raise BaseException("unsupported fetch type:%s" % (typ,))
128 
129 

◆ getDomainNameFromURL()

def dc_crawler.Fetcher.BaseFetcher.getDomainNameFromURL (   self,
  url,
  default = '' 
)

Definition at line 142 of file Fetcher.py.

142  def getDomainNameFromURL(self, url, default=''):
143  ret = default
144 
145  urlParts = urlsplit(url)
146  if len(urlParts) > 1:
147  ret = urlParts[1]
148 
149  return ret
150 
151 
152 # # Check redirects hook
153 #
154 #
Here is the caller graph for this function:

◆ init()

def dc_crawler.Fetcher.BaseFetcher.init (   dbWrapper = None,
  siteId = None 
)
static

Definition at line 71 of file Fetcher.py.

71  def init(dbWrapper=None, siteId=None):
72  # enumerate content_types we don't want to fetch
73  BaseFetcher.prohibited_conten_types = ["audio/mpeg", "application/pdf"]
74 
75  BaseFetcher.fetchers = {
76  BaseFetcher.TYP_NORMAL : RequestsFetcher(dbWrapper, siteId),
77  BaseFetcher.TYP_DYNAMIC: SeleniumFetcher(),
78  BaseFetcher.TYP_URLLIB: URLLibFetcher(),
79  BaseFetcher.TYP_CONTENT: ContentFetcher()
80  }
81 
Here is the call graph for this function:
Here is the caller graph for this function:

◆ open()

def dc_crawler.Fetcher.BaseFetcher.open (   self,
  url,
  method = 'get',
  headers = None,
  timeout = 100,
  allow_redirects = True,
  proxies = None,
  auth = None,
  data = None,
  log = None,
  allowed_content_types = None,
  max_resource_size = None,
  max_redirects = CONSTS.MAX_HTTP_REDIRECTS_LIMIT,
  filters = None,
  executable_path = None,
  depth = None,
  macro = None 
)

Definition at line 109 of file Fetcher.py.

109  macro=None):
110  if headers is None:
111  headers = {}
112  del url, method, headers, timeout, allow_redirects, proxies, auth, data, log, allowed_content_types, \
113  max_resource_size, max_redirects, filters, executable_path, depth, macro
114 
115 
Here is the caller graph for this function:

◆ should_have_meta_res()

def dc_crawler.Fetcher.BaseFetcher.should_have_meta_res (   self)

Definition at line 133 of file Fetcher.py.

133  def should_have_meta_res(self):
134 
135  return False
136 

Member Data Documentation

◆ CONNECTION_TIMEOUT

float dc_crawler.Fetcher.BaseFetcher.CONNECTION_TIMEOUT = 1.0
static

Definition at line 60 of file Fetcher.py.

◆ connectionTimeout

dc_crawler.Fetcher.BaseFetcher.connectionTimeout

Definition at line 66 of file Fetcher.py.

◆ fetchers

dc_crawler.Fetcher.BaseFetcher.fetchers = None
static

Definition at line 52 of file Fetcher.py.

◆ logger

dc_crawler.Fetcher.BaseFetcher.logger

Definition at line 67 of file Fetcher.py.

◆ TYP_AUTO

int dc_crawler.Fetcher.BaseFetcher.TYP_AUTO = 7
static

Definition at line 58 of file Fetcher.py.

◆ TYP_CONTENT

int dc_crawler.Fetcher.BaseFetcher.TYP_CONTENT = 6
static

Definition at line 57 of file Fetcher.py.

◆ TYP_DYNAMIC

int dc_crawler.Fetcher.BaseFetcher.TYP_DYNAMIC = 2
static

Definition at line 55 of file Fetcher.py.

◆ TYP_NORMAL

int dc_crawler.Fetcher.BaseFetcher.TYP_NORMAL = 1
static

Definition at line 54 of file Fetcher.py.

◆ TYP_URLLIB

int dc_crawler.Fetcher.BaseFetcher.TYP_URLLIB = 5
static

Definition at line 56 of file Fetcher.py.


The documentation for this class was generated from the following file: