2 @file NewspaperWrapper.py 3 @author Scorp <developers.hce@gmail.com> 4 @link http://hierarchical-cluster-engine.com/ 5 @copyright Copyright © 2013 IOIX Ukraine 6 @license http://hierarchical-cluster-engine.com/license/ 7 @package HCE project node API 12 from newspaper
import Article
13 from newspaper
import images
33 def __init__(self, url, title=u'', source_url=u'', config=None, isLoadUrls=True, **kwargs):
34 super(NewspaperWrapper, self).
__init__(url, title, source_url, config, **kwargs)
46 img_dimension = images.fetch_image_dimension(image, self.config.browser_user_agent)
47 except Exception, err:
48 logger.error(
"fetch_image_dimension: %s", str(err))
51 if img_dimension
is None:
52 img_dimensions.append({
"dim":
None,
"img_url": image})
54 img_dimensions.append({
"dim": img_dimension[0] * img_dimension[1],
"img_url": image})
55 ret = [img[
"img_url"]
for img
in sorted(img_dimensions, key=
lambda img: img[
"dim"], reverse=
True)]
65 argsResult = inspect.getargspec(getattr(self.extractor, methodName))
66 if argsResult
is not None and argsResult.args
is not None and \
67 len(argsResult.args) - (0
if argsResult.defaults
is None else len(argsResult.defaults)) == 2
and \
68 "article" in argsResult.args:
69 ret = getattr(self.extractor, methodName)(self)
71 ret = getattr(self.extractor, methodName)(self.url, self.clean_doc)
77 if self.clean_doc
is not None:
79 self.set_meta_img(meta_img_url)
81 if self.clean_top_node
is not None and not self.has_top_image():
85 if not self.has_top_image()
and self.
isLoadUrls:
86 self.set_reddit_top_img()
88 logger.debug(
">>> not load urls")
90 if self.
isLoadUrls and self.clean_doc
is not None:
103 logger.debug(
">>> not load urls")
def sort_images(self, imgs)
def __init__(self, url, title=u'', source_url=u'', config=None, isLoadUrls=True, kwargs)
def versionnedWrapper(self, methodName)
def set_top_img(self, src_url)