HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
NewspaperWrapper.py
Go to the documentation of this file.
1 """@package docstring
2  @file NewspaperWrapper.py
3  @author Scorp <developers.hce@gmail.com>
4  @link http://hierarchical-cluster-engine.com/
5  @copyright Copyright &copy; 2013 IOIX Ukraine
6  @license http://hierarchical-cluster-engine.com/license/
7  @package HCE project node API
8  @since 0.1
9 """
10 
11 import inspect
12 from newspaper import Article
13 from newspaper import images
14 import app.Utils as Utils # pylint: disable=F0401
15 
16 # Logger initialization: module-level logger obtained via Utils.MPLogger and used by the NewspaperWrapper methods
17 logger = Utils.MPLogger().getLogger()
18 
19 
20 # # Class NewspaperWrapper is our own wrapper for the Article class - the main class of the newspaper library
21 #
class NewspaperWrapper(Article):
    """Article subclass with optional image loading and extractor-version tolerance.

    Adds the ``isLoadUrls`` switch, which disables the image related steps of
    ``fetch_images``/``set_top_img``, and calls extractor methods through
    ``versionnedWrapper`` so that both known extractor signatures are supported.
    """

    # # Class constructor
    #
    # @param url - resource's url
    # @param title - param that is internally used in newspaper library
    # @param source_url - param that is internally used in newspaper library
    # @param config - param that is internally used in newspaper library
    # @param isLoadUrls - bool value that indicates whether resource images have to be loaded
    # @param kwargs - params that are internally used in newspaper library
    def __init__(self, url, title=u'', source_url=u'', config=None, isLoadUrls=True, **kwargs):
        super(NewspaperWrapper, self).__init__(url, title, source_url, config, **kwargs)
        self.isLoadUrls = isLoadUrls

    # # sort_images method, sorts resource images by the product of their fetched dimensions
    #
    # Images whose dimensions could not be fetched (error or None result) are placed after all
    # images with known dimensions. Images with equal keys keep their incoming relative order.
    #
    # @param imgs - incoming list of resource image urls
    # @return list of image urls ordered from the largest to the smallest dimensions product
    def sort_images(self, imgs):
        measured = []
        for image in imgs:
            try:
                img_dimension = images.fetch_image_dimension(image, self.config.browser_user_agent)
            except Exception as err:  # pylint: disable=W0703
                # Best effort: a failure for one image must not abort sorting of the others
                logger.error("fetch_image_dimension: %s", str(err))
                img_dimension = None

            if img_dimension is None:
                area = None
            else:
                area = img_dimension[0] * img_dimension[1]
            measured.append({"dim": area, "img_url": image})

        # Explicit key instead of comparing None with numbers: (False, 0) sorts below any
        # (True, area), which reproduces the Python 2 "None is the smallest" ordering and
        # does not raise TypeError on Python 3.
        def sortKey(item):
            dim = item["dim"]
            return (dim is not None, 0 if dim is None else dim)

        return [item["img_url"] for item in sorted(measured, key=sortKey, reverse=True)]

    # # versionnedWrapper wrap method, allows to call extractor methods that have different
    # parameter lists in different library versions
    #
    # If the extractor method has exactly two arguments without defaults (the bound instance
    # and "article") it is called with this object, otherwise with (self.url, self.clean_doc).
    #
    # @param methodName - name of wrapped extractor method
    # @return wrapped method return value
    def versionnedWrapper(self, methodName):
        method = getattr(self.extractor, methodName)  # pylint: disable=E1101
        # inspect.getargspec() does not exist in recent Python 3 versions, so it is used only
        # as a fallback when getfullargspec() is absent (Python 2). Both report the bound
        # first argument of a bound method in ``args``.
        getSpec = getattr(inspect, "getfullargspec", None) or inspect.getargspec
        spec = getSpec(method)
        argNames = spec.args or []
        requiredCount = len(argNames) - len(spec.defaults or ())
        if requiredCount == 2 and "article" in argNames:
            ret = method(self)
        else:
            ret = method(self.url, self.clean_doc)  # pylint: disable=E1101
        return ret

    # # fetch_images overloaded version of Article.fetch_images method
    #
    # Sets the meta image and, if needed, the top image from the cleaned document; the reddit
    # top image and the sorted list of all images are set only when isLoadUrls is True.
    def fetch_images(self):
        if self.clean_doc is not None:  # pylint: disable=E1101
            meta_img_url = self.versionnedWrapper("get_meta_img_url")
            self.set_meta_img(meta_img_url)  # pylint: disable=E1101

            if self.clean_top_node is not None and not self.has_top_image():  # pylint: disable=E1101
                first_img = self.versionnedWrapper("get_first_img_url")
                self.set_top_img(first_img)

        if not self.has_top_image() and self.isLoadUrls:  # pylint: disable=E1101
            self.set_reddit_top_img()
        elif not self.isLoadUrls:
            # Logged only when loading is really disabled, not when a top image already exists
            logger.debug(">>> not load urls")

        if self.isLoadUrls and self.clean_doc is not None:
            imgs = self.versionnedWrapper("get_img_urls")
            imgs = self.sort_images(imgs)
            self.set_imgs(imgs)

    # # set_top_img overloaded version of Article.set_top_img method
    #
    # Delegates to the base class only when isLoadUrls is True, otherwise does nothing
    # except writing a debug message.
    #
    # @param src_url - url of source image
    def set_top_img(self, src_url):
        if self.isLoadUrls:
            super(NewspaperWrapper, self).set_top_img(src_url)
        else:
            logger.debug(">>> not load urls")
def __init__(self, url, title=u'', source_url=u'', config=None, isLoadUrls=True, kwargs)