5 HCE project, Python bindings, Distributed Tasks Manager application. 6 Event objects definitions. 10 @author Oleksii <developers.hce@gmail.com> 11 @link: http://hierarchical-cluster-engine.com/ 12 @copyright: Copyright © 2013-2014 IOIX Ukraine 13 @license: http://hierarchical-cluster-engine.com/license/ 25 from subprocess
import Popen
26 from subprocess
import PIPE
28 import MySQLdb.cursors
29 from contextlib
import closing
33 from urlparse
import urlparse
# Module-level setup. Logging goes to "prepairer.log", reopened in "w" mode on
# every run, through the "Prepairer" logger at DEBUG level. The two DB
# connection handles start out as None, and site_templates_dic starts empty.
# NOTE(review): this listing is a partial extraction (the embedded line
# numbers skip), so statements between the visible ones are not shown here.
42 logging.basicConfig(filename=
"prepairer.log", filemode=
"w")
43 logger = logging.getLogger(
"Prepairer")
44 logger.setLevel(
"DEBUG")
47 dc_sites_db_connect =
None 48 dc_urls_db_connect =
None 51 site_templates_dic = {}
# SQL that pairs every site URL with the value of that site's 'template'
# property (sites_urls joined to sites_properties on Site_Id).
56 query =
"SELECT sites_urls.URL, sites_properties.`Value` FROM `sites_properties` INNER JOIN sites_urls ON sites_urls.Site_Id = sites_properties.Site_Id AND sites_properties.Name = 'template'" 63 a = urlparse(url).netloc.split(
":")[0].split(
".")
# `a` holds the dot-separated labels of the URL's host with any ":port" suffix
# removed. A host with more than two labels whose third-from-last label is not
# "www" is rebuilt from its last three labels; the two-label join below is
# presumably the else branch (the branch keyword is on a line not shown).
# NOTE(review): the joins read `arr` while the split result is bound to `a` --
# confirm `arr` is defined in the omitted lines, otherwise this is a NameError.
66 if len(a) > 2
and a[-3] !=
"www":
68 b = str(arr[-3] +
"." + arr[-2] +
"." + arr[-1])
71 b = str(arr[-2] +
"." + arr[-1])
# Copy each row of `templates` into site_templates_dic as URL -> Value, dump
# that mapping as JSON into the file "sites_templates_dic", then fill
# templates_dic with the MySQL-escaped template values keyed by the md5 hex
# digest of `url`.
# NOTE(review): `url` is not assigned in the visible part of the second loop;
# presumably it is derived from `key` on an omitted line -- confirm.
78 for template
in templates:
80 site_templates_dic[template[
"URL"]] = template[
"Value"]
81 with open(
"sites_templates_dic",
"w")
as f:
82 f.write(json.dumps(site_templates_dic))
83 for (key, value)
in site_templates_dic.items():
85 md5 = hashlib.md5(url).hexdigest()
86 templates_dic[md5] = MySQLdb.escape_string(value)
# Open a DictCursor on db_connector (closed automatically by `closing`) and
# return every fetched row; on a MySQL error the connection is rolled back.
# NOTE(review): the enclosing `try` and the statement that executes the query
# are on lines not shown; `mdb` is presumably an alias for MySQLdb -- confirm.
92 with closing(db_connector.cursor(MySQLdb.cursors.DictCursor))
as cursor:
95 return cursor.fetchall()
96 except mdb.Error
as err:
97 db_connector.rollback()
# Rebind the two module-global connection handles: one connection to the
# "dc_sites" database and one to "dc_urls", both opened with the same
# host, user, password and port.
# NOTE(review): dbHost/dbUser/dbPWD/dbPort are assigned on lines not shown.
103 global dc_sites_db_connect
104 global dc_urls_db_connect
111 db_dc_sites =
"dc_sites" 112 db_dc_urls =
"dc_urls" 114 dc_sites_db_connect = mdb.connect(dbHost, dbUser, dbPWD, db_dc_sites, dbPort)
115 dc_urls_db_connect = mdb.connect(dbHost, dbUser, dbPWD, db_dc_urls, dbPort)
# Build a Site from the whitespace-stripped input URL: normalise it with
# cutURL, create the Site from the normalised URL, attach a single SiteFilter
# whose pattern is ".*" + norm_url + ".*", and copy the template from
# templates_dic into the site's properties when site.id is a key there.
# NOTE(review): norm_url is concatenated into the pattern as-is; if the filter
# is evaluated as a regex, the dots in the host match any character -- confirm
# whether escaping is wanted.
121 input_url = input_url.strip()
127 norm_url =
cutURL(input_url)
130 site =
Site(norm_url)
133 site_filter_pattern =
".*" + norm_url +
".*" 134 site_filters =
SiteFilter(site.id, site_filter_pattern)
138 if site.id
in templates_dic:
139 site.properties[
"template"] = templates_dic[site.id]
144 site.filters = [site_filters]
# Create the URL object for this site and input URL, then set its status to
# URL.STATUS_SELECTED_CRAWLING and its type to URL.TYPE_SINGLE.
153 url =
URL(site.id, input_url)
154 url.status = URL.STATUS_SELECTED_CRAWLING
155 url.type = URL.TYPE_SINGLE
# Write the site's JSON to "site_<id>.json" in the current directory, run
# dc-client.py with --command=SITE_NEW on that file through the shell, and
# save the client's stdout to "dc-client_new_site_output.txt".
# NOTE(review): both files are opened without `with`/close(), so closing is
# left to the interpreter. stderr is not piped, so `err` is always None.
# communicate() already waits for the process; wait() only fetches the code.
160 file_name =
"site_" + str(site.id) +
".json" 161 open(file_name,
"w").write(site.toJSON())
162 cmd =
"./dc-client.py --config=../ini/dc-client.ini --command=SITE_NEW --file=./%s" % file_name
163 process = Popen(cmd, stdout=PIPE, stdin=PIPE, shell=
True, close_fds=
True)
164 (output, err) = process.communicate()
165 exit_code = process.wait()
166 open(
"dc-client_new_site_output.txt",
"w").write(output)
# Write the URL's JSON, wrapped in "[" ... "]" as a one-element list, to
# "url_<site id>.json", run dc-client.py with --command=URL_NEW on that file
# through the shell, and save its stdout to "dc-client_new_url_output.txt".
# NOTE(review): the file name is keyed by site.id, so a second URL for the
# same site overwrites the first file. Files are opened without `with`, and
# `err` is always None because stderr is not piped.
171 file_name =
"url_" + str(site.id) +
".json" 172 open(file_name,
"w").write(
"[" + url.toJSON() +
"]")
173 cmd =
"./dc-client.py --config=../ini/dc-client.ini --command=URL_NEW --file=./%s" % file_name
174 process = Popen(cmd, stdout=PIPE, stdin=PIPE, shell=
True, close_fds=
True)
175 (output, err) = process.communicate()
176 exit_code = process.wait()
177 open(
"dc-client_new_url_output.txt",
"w").write(output)
# Build a URLUpdate for this site and input URL with status
# URL.STATUS_SELECTED_CRAWLING, write it as a one-element JSON list to
# "url_<urlMd5>.json", run dc-client.py with --command=URL_UPDATE on that file
# through the shell, and save its stdout to "dc-client_update_url_output.txt".
# NOTE(review): files are opened without `with`/close(); `err` is always None
# because stderr is not piped; exit_code is not used in the visible lines.
182 url_updated =
URLUpdate(site.id, input_url)
183 url_updated.status = URL.STATUS_SELECTED_CRAWLING
185 file_name =
"url_" + str(url_updated.urlMd5) +
".json" 186 open(file_name,
"w").write(
"[" + url_updated.toJSON() +
"]")
187 cmd =
"./dc-client.py --config=../ini/dc-client.ini --command=URL_UPDATE --file=./%s" % file_name
188 process = Popen(cmd, stdout=PIPE, stdin=PIPE, shell=
True, close_fds=
True)
189 (output, err) = process.communicate()
190 exit_code = process.wait()
191 open(
"dc-client_update_url_output.txt",
"w").write(output)
# Set the updated site's state to Site.STATE_ACTIVE, write its JSON to
# "updated_site_<id>.json", run dc-client.py with --command=SITE_UPDATE on
# that file through the shell, and save its stdout to
# "dc-client_update_site_output.txt".
# NOTE(review): `site_updated` is created on a line not shown. Files are
# opened without `with`/close(), and `err` is always None (stderr not piped).
197 site_updated.state = Site.STATE_ACTIVE
198 file_name =
"updated_site_" + str(site_updated.id) +
".json" 199 open(file_name,
"w").write(site_updated.toJSON())
200 cmd =
"./dc-client.py --config=../ini/dc-client.ini --command=SITE_UPDATE --file=./%s" % file_name
201 process = Popen(cmd, stdout=PIPE, stdin=PIPE, shell=
True, close_fds=
True)
202 (output, err) = process.communicate()
203 exit_code = process.wait()
204 open(
"dc-client_update_site_output.txt",
"w").write(output)
# Read URLs from stdin one per line, strip and debug-log each, write the URL
# into a file named after site.id, and wrap site/url into a BatchItem. A
# Batch built from id 11 and url_list is then pickled and printed to stdout
# (Python 2 print statement).
# NOTE(review): the lines that create `site`, `url` and `url_list` are not
# shown, and it is not visible whether the Batch is built inside or after the
# loop -- confirm. The site.id file is opened without `with`/close().
209 for input_url
in sys.stdin:
210 input_url = input_url.strip()
211 logger.debug(input_url)
215 open(site.id,
"w").write(input_url)
223 bItem =
BatchItem(site.id, url.urlMd5, url)
225 input_object =
Batch(11, url_list)
226 input_pickled_object = pickle.dumps(input_object)
227 print input_pickled_object
230 if __name__ ==
"__main__":
def createSiteObj(input_url)
def executeQuery(db_connector, query)
def readTemplatesFromMySQL()
def updateURL(input_url, site)
def createURLObj(site, input_url)