search_engine_parser.py
#!/usr/bin/python


"""
HCE project, Python bindings, Distributed Tasks Manager application.
Search engine parser: fetches a search results page and feeds it to the scraper.

@package: dc
@file search_engine_parser.py
@author Oleksii <developers.hce@gmail.com>
@link: http://hierarchical-cluster-engine.com/
@copyright: Copyright &copy; 2013-2014 IOIX Ukraine
@license: http://hierarchical-cluster-engine.com/license/
@since: 0.1
"""

# Example of usage:
# ./search_engine_parser.py < ../data/ftests/test_search_engine/list_of_urls.txt
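#
# Each input line is a URL, optionally followed by a comma and a template
# string (an assumption based on the parsing in process() below, where the
# second field is read but not used further yet), e.g.:
# https://www.google.com/search?q=mac+os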

import ppath
from ppath import sys

import logging
from subprocess import Popen
from subprocess import PIPE
import pickle
import hashlib
import requests
import datetime
from dc_processor.ScraperInData import ScraperInData
import app.Utils

logging.basicConfig(filename="/tmp/search_engine.log", filemode="w")
logger = logging.getLogger("search_engine")
logger.setLevel(logging.DEBUG)


def process(input_data):
    logger.debug("input: %s", input_data)
    splitted_data = input_data.split(',')
    url = splitted_data[0]
    site_id = "d57f144e7b26c9976769ea94f18b9064" if "google" in url else "1fe592caf03fd50c5f065c30f82b13bb"
    #site_id = hashlib.md5(app.Utils.UrlParser.generateDomainUrl(url)).hexdigest()
    logger.debug("site_id: %s", str(site_id))
    template = None
    if len(splitted_data) == 2:
        # the optional second field; parsed but not used further yet
        template = splitted_data[1]
    content = getContent(url)
    lastModified = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    scraper_input = ScraperInData(url, None, site_id, content, "", None, lastModified, None)
    input_pickled_object = pickle.dumps(scraper_input)
    #logger.debug("scraper input: %s", str(input_pickled_object))
    cmd = "./scraper.py --config=../ini/scraper_search_engine.ini"
    scraper = Popen(cmd, stdout=PIPE, stdin=PIPE, stderr=PIPE, shell=True, close_fds=True)
    (output, err) = scraper.communicate(input=input_pickled_object)
    logger.debug("scraper response output: %s", str(output))
    logger.debug("scraper response error: %s", str(err))
    logger.debug("scraper exit code: %s", str(scraper.returncode))
    return output
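

# A generic way to derive site_id from the URL's domain, following the
# commented-out hint inside process(). A minimal sketch, assuming
# app.Utils.UrlParser.generateDomainUrl() returns the domain URL as a string.
def computeSiteId(url):
    # hex md5 of the domain URL, matching the format of the hard-coded ids
    return hashlib.md5(app.Utils.UrlParser.generateDomainUrl(url)).hexdigest()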


def getContent(url):
    # Shell equivalent:
    # wget -S --no-check-certificate -U "Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3" "https://www.google.com/search?q=mac+os"
    # The command is built as an argument list (no shell) so that special
    # characters in the URL cannot break or inject into the command line.
    cmd = ["wget", "-qO-", "-S", "--no-check-certificate",
           "-U", "Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3",
           url]
    wget = Popen(cmd, stdout=PIPE, stderr=PIPE, close_fds=True)
    (output, err) = wget.communicate()
    logger.debug("wget exit code: %s", str(wget.returncode))
    #logger.debug("Raw content output: %s", output)
    return output
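

# An alternative fetch via the requests library; a minimal sketch, assuming
# the same user agent string and, as with the wget call above, certificate
# verification disabled. Note it returns decoded text rather than the raw
# bytes wget produces.
def getContentRequests(url):
    headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3"}
    # verify=False mirrors wget's --no-check-certificate
    response = requests.get(url=url, headers=headers, verify=False)
    return response.text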


if __name__ == "__main__":
    for input_url in sys.stdin:
        # strip the trailing newline so it does not become part of the URL
        output = process(input_url.strip())
        print output