HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
ftest_dc_ProcessorTask_batch_processing.py
Go to the documentation of this file.
1 #!/usr/bin/python
2 
3 
4 """
5 HCE project, Python bindings, Distributed Crawler application.
6 Functional test for ProcessorTask batch processing: builds the BatchItem fixtures used by the test.
7 
8 @package: dc
9 @file ftest_dc_ProcessorTask_batch_processing.py
10 @author Oleksii <developers.hce@gmail.com>
11 @link: http://hierarchical-cluster-engine.com/
12 @copyright: Copyright &copy; 2013-2014 IOIX Ukraine
13 @license: http://hierarchical-cluster-engine.com/license/
14 @since: 0.1
15 """
16 
17 
18 import md5
19 # import pickle
20 try:
21  import cPickle as pickle
22 except ImportError:
23  import pickle
24 from collections import namedtuple
25 from subprocess import Popen
26 from subprocess import PIPE
27 from dc.EventObjects import Batch
28 from dc.EventObjects import BatchItem
29 from dtm.EventObjects import GeneralResponse
30 
31 """
32 #url = "http://www.yomiuri.co.jp/economy/20140424-OYT1T50032.html?from=ycont_top_txt"
33 #siteId = ""
34 #urlId = "fb04cc869245f17a34e1691054e6b5ea"
35 #insert into `urls_0`(`URLMd5`, `URL`) values("fb04cc869245f17a34e1691054e6b5ea", "http://www.yomiuri.co.jp/economy/20140424-OYT1T50032.html?from=ycont_top_txt")
36 """
37 
38 # base template update
39 # UPDATE `sites_properties` SET `Value`="{'description': [\"//meta[contains(@name,'description')]/@content\"], 'pubdate': [\"//meta[contains(@name,'date')]/@content\", \"//span[contains(@class,'date')]/text()\", \"//p[contains(@class,'date')]/text()\", \"//div[contains(@id,'date')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//div[contains(@id,'mage')]/img//@src\", \"//div[contains(@class,'Image')]//@href\"], 'title': [\"//meta[contains(@property,'title')]/@content\", \"//meta[contains(@name,'title')]/@content\", '//title/text()'], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'Text')]/p//text()\", \"//div[contains(@class,'ext')]/p//text()\", \"//div[contains(@class,'cbox-content row')]/p//text()\", \"//p[contains(@id,'ews')]/text()\", \"//div[contains(@class,'ews')]/p//text()\", \"//meta[contains(@name,'content_encoded')]/@content\", \"//p[contains(@class,'lead-def')]/text()\", \"//span[contains(@id,'article')]/p/text()\", \"//div[contains(@class,'article')]//p//text()\", \"//div[contains(@class,'content')]/p//text()\", \"//div[contains(@id,'rticle')]/p//text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}" WHERE `Site_Id`="130aadc544a6a69d92524166afc41745" AND `Name`="template"
# Batch item 1: a yomiuri.co.jp politics article.
# siteId1 is the MD5 hex digest of the site root URL and urlId1 the MD5 hex
# digest of the full article URL; both are produced in one pass and unpacked
# in that order.
siteId1, urlId1 = (str(md5.new(address).hexdigest()) for address in (
    "http://www.yomiuri.co.jp",
    "http://www.yomiuri.co.jp/politics/20140422-OYT1T50105.html?from=ycont_top_txt",
))
bItem1 = BatchItem(siteId1, urlId1)
43 # insert into `urls_8720b16ad7304605daf987f3585839d0`(`Site_Id`, `URLMd5`, `URL`) values("8720b16ad7304605daf987f3585839d0","9b2f8c19c95fbf776e50320299f4e197", "http://www.yomiuri.co.jp/politics/20140422-OYT1T50105.html?from=ycont_top_txt")
44 # INSERT INTO `sites_properties`(`Site_Id`, `Name`, `Value`, `UDate`, `CDate`) VALUES ("8720b16ad7304605daf987f3585839d0","template","{'description': [\"//meta[contains(@name,'description')]/@content\"], 'pubdate': [\"//meta[contains(@name,'date')]/@content\", \"//span[contains(@class,'date')]/text()\", \"//p[contains(@class,'date')]/text()\", \"//div[contains(@id,'date')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//div[contains(@id,'mage')]/img//@src\", \"//div[contains(@class,'Image')]//@href\"], 'title': [\"//meta[contains(@property,'title')]/@content\", \"//meta[contains(@name,'title')]/@content\", '//title/text()'], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'Text')]/p//text()\", \"//div[contains(@class,'ext')]/p//text()\", \"//div[contains(@class,'cbox-content row')]/p//text()\", \"//p[contains(@id,'ews')]/text()\", \"//div[contains(@class,'ews')]/p//text()\", \"//meta[contains(@name,'content_encoded')]/@content\", \"//p[contains(@class,'lead-def')]/text()\", \"//span[contains(@id,'article')]/p/text()\", \"//div[contains(@class,'article')]//p//text()\", \"//div[contains(@class,'content')]/p//text()\", \"//div[contains(@id,'rticle')]/p//text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}",NOW(),NOW())
45 # template pubdate
46 # UPDATE `sites_properties` SET `Value`="{'description': [\"//meta[contains(@name,'description')]/@content\"], 'pubdate': [\"//meta[contains(@property,'article:published_time')]/@content\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//div[contains(@id,'mage')]/img//@src\", \"//div[contains(@class,'Image')]//@href\"], 'title': [\"//meta[contains(@property,'title')]/@content\", \"//meta[contains(@name,'title')]/@content\", '//title/text()'], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'Text')]/p//text()\", \"//div[contains(@class,'ext')]/p//text()\", \"//div[contains(@class,'cbox-content row')]/p//text()\", \"//p[contains(@id,'ews')]/text()\", \"//div[contains(@class,'ews')]/p//text()\", \"//meta[contains(@name,'content_encoded')]/@content\", \"//p[contains(@class,'lead-def')]/text()\", \"//span[contains(@id,'article')]/p/text()\", \"//div[contains(@class,'article')]//p//text()\", \"//div[contains(@class,'content')]/p//text()\", \"//div[contains(@id,'rticle')]/p//text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}" WHERE `Site_Id`="8720b16ad7304605daf987f3585839d0" AND `Name`="template"
47 # template title
48 # UPDATE `sites_properties` SET `Value`="{'description': [\"//meta[contains(@name,'description')]/@content\"], 'pubdate': [\"//meta[contains(@property,'article:published_time')]/@content\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//div[contains(@id,'mage')]/img//@src\", \"//div[contains(@class,'Image')]//@href\"], 'title': ['//article/h1/text()'], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'Text')]/p//text()\", \"//div[contains(@class,'ext')]/p//text()\", \"//div[contains(@class,'cbox-content row')]/p//text()\", \"//p[contains(@id,'ews')]/text()\", \"//div[contains(@class,'ews')]/p//text()\", \"//meta[contains(@name,'content_encoded')]/@content\", \"//p[contains(@class,'lead-def')]/text()\", \"//span[contains(@id,'article')]/p/text()\", \"//div[contains(@class,'article')]//p//text()\", \"//div[contains(@class,'content')]/p//text()\", \"//div[contains(@id,'rticle')]/p//text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}" WHERE `Site_Id`="8720b16ad7304605daf987f3585839d0" AND `Name`="template"
49 # template media
50 # UPDATE `sites_properties` SET `Value`="{'description': [\"//meta[contains(@name,'description')]/@content\"], 'pubdate': [\"//meta[contains(@property,'article:published_time')]/@content\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//img[contains(@data-original,'')]/@data-original\"], 'title': ['//article/h1/text()'], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'Text')]/p//text()\", \"//div[contains(@class,'ext')]/p//text()\", \"//div[contains(@class,'cbox-content row')]/p//text()\", \"//p[contains(@id,'ews')]/text()\", \"//div[contains(@class,'ews')]/p//text()\", \"//meta[contains(@name,'content_encoded')]/@content\", \"//p[contains(@class,'lead-def')]/text()\", \"//span[contains(@id,'article')]/p/text()\", \"//div[contains(@class,'article')]//p//text()\", \"//div[contains(@class,'content')]/p//text()\", \"//div[contains(@id,'rticle')]/p//text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}" WHERE `Site_Id`="8720b16ad7304605daf987f3585839d0" AND `Name`="template"
51 # template description
52 # UPDATE `sites_properties` SET `Value`="{'description': [\"//p[contains(@class,'par1')]/text()\"], 'pubdate': [\"//meta[contains(@property,'article:published_time')]/@content\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//img[contains(@data-original,'')]/@data-original\"], 'title': ['//article/h1/text()'], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'Text')]/p//text()\", \"//div[contains(@class,'ext')]/p//text()\", \"//div[contains(@class,'cbox-content row')]/p//text()\", \"//p[contains(@id,'ews')]/text()\", \"//div[contains(@class,'ews')]/p//text()\", \"//meta[contains(@name,'content_encoded')]/@content\", \"//p[contains(@class,'lead-def')]/text()\", \"//span[contains(@id,'article')]/p/text()\", \"//div[contains(@class,'article')]//p//text()\", \"//div[contains(@class,'content')]/p//text()\", \"//div[contains(@id,'rticle')]/p//text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}" WHERE `Site_Id`="8720b16ad7304605daf987f3585839d0" AND `Name`="template"
53 # UPDATE `sites_properties` SET `Value`="{'description': [\"//p[contains(@class,'par1')]//text()\"], 'pubdate': [\"//meta[contains(@property,'article:published_time')]/@content\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//img[contains(@data-original,'')]/@data-original\"], 'title': ['//article/h1/text()'], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@property,'article:published_time')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//p[contains(@itemprop,'articleBody')]/text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}" WHERE `Site_Id`="8720b16ad7304605daf987f3585839d0" AND `Name`="template"
54 
# Batch item 2: an asahi.com article.
# siteId2 is the MD5 hex digest of the site root URL and urlId2 the MD5 hex
# digest of the full article URL; both are produced in one pass and unpacked
# in that order.
siteId2, urlId2 = (str(md5.new(address).hexdigest()) for address in (
    "http://www.asahi.com",
    "http://www.asahi.com/articles/ASG4Q4D0DG4QUTQP00Z.html",
))
bItem2 = BatchItem(siteId2, urlId2)
58 # insert into `urls_bde344064dc99424033f1d39a0e63a11`(`Site_Id`, `URLMd5`, `URL`) values("bde344064dc99424033f1d39a0e63a11","6542b10f072e83dac21e44e2b665495c", "http://www.asahi.com/articles/ASG4Q4D0DG4QUTQP00Z.html")
59 # INSERT INTO `sites_properties`(`Site_Id`, `Name`, `Value`, `UDate`, `CDate`) VALUES ("bde344064dc99424033f1d39a0e63a11","template","{'description': [\"//meta[contains(@name,'description')]/@content\"], 'pubdate': [\"//meta[contains(@name,'date')]/@content\", \"//span[contains(@class,'date')]/text()\", \"//p[contains(@class,'date')]/text()\", \"//div[contains(@id,'date')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//div[contains(@id,'mage')]/img//@src\", \"//div[contains(@class,'Image')]//@href\"], 'title': [\"//meta[contains(@property,'title')]/@content\", \"//meta[contains(@name,'title')]/@content\", '//title/text()'], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'Text')]/p//text()\", \"//div[contains(@class,'ext')]/p//text()\", \"//div[contains(@class,'cbox-content row')]/p//text()\", \"//p[contains(@id,'ews')]/text()\", \"//div[contains(@class,'ews')]/p//text()\", \"//meta[contains(@name,'content_encoded')]/@content\", \"//p[contains(@class,'lead-def')]/text()\", \"//span[contains(@id,'article')]/p/text()\", \"//div[contains(@class,'article')]//p//text()\", \"//div[contains(@class,'content')]/p//text()\", \"//div[contains(@id,'rticle')]/p//text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}",NOW(),NOW())
60 # INSERT INTO `dc_sites`.`sites` (`Id`, `UDate`, `TcDate`, `CDate`, `Resources`, `Iterations`, `State`, `Priority`, `MaxURLs`, `MaxResources`, `MaxErrors`, `MaxResourceSize`, `RequestDelay`, `HTTPTimeout`, `ErrorMask`, `Errors`, `URLType`) VALUES ('bde344064dc99424033f1d39a0e63a11', NULL, NULL, '2014-04-16 14:09:06', '0', '0', '1', '0', '0', '0', '0', '0', '0', '30000', '0', '0', '0');
61 # UPDATE `sites_properties` SET `Value`="{'description': [\"//div[contains(@class,'ArticleText')]/p//text()\"], 'pubdate': [\"//meta[contains(@name,'date')]/@content\", \"//span[contains(@class,'date')]/text()\", \"//p[contains(@class,'date')]/text()\", \"//div[contains(@id,'date')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//div[contains(@id,'mage')]/img//@src\", \"//div[contains(@class,'Image')]//@href\"], 'title': [\"//meta[contains(@name,'TITLE')]/@content\", \"//meta[contains(@name,'title')]/@content\", '//title/text()'], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'Text')]/p//text()\", \"//div[contains(@class,'ext')]/p//text()\", \"//div[contains(@class,'cbox-content row')]/p//text()\", \"//p[contains(@id,'ews')]/text()\", \"//div[contains(@class,'ews')]/p//text()\", \"//meta[contains(@name,'content_encoded')]/@content\", \"//p[contains(@class,'lead-def')]/text()\", \"//span[contains(@id,'article')]/p/text()\", \"//div[contains(@class,'article')]//p//text()\", \"//div[contains(@class,'content')]/p//text()\", \"//div[contains(@id,'rticle')]/p//text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}" WHERE `Site_Id`="bde344064dc99424033f1d39a0e63a11" AND `Name`="template"
62 # UPDATE `sites_properties` SET `Value`="{'description': [\"//div[contains(@class,'ArticleText')]/p//text()\"], 'pubdate': [\"//meta[contains(@name,'RELEASE_DATE')]/@content\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//div[contains(@id,'mage')]/img//@src\", \"//div[contains(@class,'Image')]//@href\"], 'title': [\"//meta[contains(@name,'TITLE')]/@content\", \"//meta[contains(@name,'title')]/@content\", '//title/text()'], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\", \"//span[contains(@class,'date')]/text()\", \"//p[contains(@class,'date')]/text()\", \"//div[contains(@id,'date')]/text()\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'Text')]/p//text()\", \"//div[contains(@class,'ext')]/p//text()\", \"//div[contains(@class,'cbox-content row')]/p//text()\", \"//p[contains(@id,'ews')]/text()\", \"//div[contains(@class,'ews')]/p//text()\", \"//meta[contains(@name,'content_encoded')]/@content\", \"//p[contains(@class,'lead-def')]/text()\", \"//span[contains(@id,'article')]/p/text()\", \"//div[contains(@class,'article')]//p//text()\", \"//div[contains(@class,'content')]/p//text()\", \"//div[contains(@id,'rticle')]/p//text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}" WHERE `Site_Id`="bde344064dc99424033f1d39a0e63a11" AND `Name`="template"
63 
# Batch item 3: a mainichi.jp opinion article.
# siteId3 is the MD5 hex digest of the site root URL and urlId3 the MD5 hex
# digest of the full article URL; both are produced in one pass and unpacked
# in that order.
siteId3, urlId3 = (str(md5.new(address).hexdigest()) for address in (
    "http://mainichi.jp",
    "http://mainichi.jp/opinion/news/20140422k0000e070249000c.html",
))
bItem3 = BatchItem(siteId3, urlId3)
67 # insert into `urls_130aadc544a6a69d92524166afc41745`(`Site_Id`, `URLMd5`, `URL`) values("130aadc544a6a69d92524166afc41745","454dfa18e4bdc40f3e649d3c0cbeb09e", "http://mainichi.jp/opinion/news/20140422k0000e070249000c.html")
68 # INSERT INTO `sites_properties`(`Site_Id`, `Name`, `Value`, `UDate`, `CDate`) VALUES ("130aadc544a6a69d92524166afc41745","template","{'description': [\"//meta[contains(@name,'description')]/@content\"], 'pubdate': [\"//meta[contains(@name,'date')]/@content\", \"//span[contains(@class,'date')]/text()\", \"//p[contains(@class,'date')]/text()\", \"//div[contains(@id,'date')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//div[contains(@id,'mage')]/img//@src\", \"//div[contains(@class,'Image')]//@href\"], 'title': [\"//meta[contains(@property,'title')]/@content\", \"//meta[contains(@name,'title')]/@content\", '//title/text()'], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'Text')]/p//text()\", \"//div[contains(@class,'ext')]/p//text()\", \"//div[contains(@class,'cbox-content row')]/p//text()\", \"//p[contains(@id,'ews')]/text()\", \"//div[contains(@class,'ews')]/p//text()\", \"//meta[contains(@name,'content_encoded')]/@content\", \"//p[contains(@class,'lead-def')]/text()\", \"//span[contains(@id,'article')]/p/text()\", \"//div[contains(@class,'article')]//p//text()\", \"//div[contains(@class,'content')]/p//text()\", \"//div[contains(@id,'rticle')]/p//text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}",NOW(),NOW())
69 # INSERT INTO `dc_sites`.`sites` (`Id`, `UDate`, `TcDate`, `CDate`, `Resources`, `Iterations`, `State`, `Priority`, `MaxURLs`, `MaxResources`, `MaxErrors`, `MaxResourceSize`, `RequestDelay`, `HTTPTimeout`, `ErrorMask`, `Errors`, `URLType`) VALUES ('130aadc544a6a69d92524166afc41745', NULL, NULL, '2014-04-16 14:09:06', '0', '0', '1', '0', '0', '0', '0', '0', '0', '30000', '0', '0', '0');
70 
71 # UPDATE `sites_properties` SET `Value`="{'description': [\"//meta[contains(@name,'description')]/@content\"], 'pubdate': [\"//meta[contains(@name,'date')]/@content\", \"//span[contains(@class,'date')]/text()\", \"//p[contains(@class,'date')]/text()\", \"//div[contains(@id,'date')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//div[contains(@id,'mage')]/img//@src\", \"//div[contains(@class,'Image')]//@href\"], 'title': [\"//meta[contains(@property,'title')]/@content\", \"//meta[contains(@name,'title')]/@content\", '//title/text()'], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'Text')]/p//text()\", \"//div[contains(@class,'ext')]/p//text()\", \"//div[contains(@class,'cbox-content row')]/p//text()\", \"//p[contains(@id,'ews')]/text()\", \"//div[contains(@class,'ews')]/p//text()\", \"//meta[contains(@name,'content_encoded')]/@content\", \"//p[contains(@class,'lead-def')]/text()\", \"//span[contains(@id,'article')]/p/text()\", \"//div[contains(@class,'article')]//p//text()\", \"//div[contains(@class,'content')]/p//text()\", \"//div[contains(@id,'rticle')]/p//text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}" WHERE `Site_Id`="130aadc544a6a69d92524166afc41745" AND `Name`="template"
72 # UPDATE `sites_properties` SET `Value`="{'description': [\"//meta[contains(@name,'description')]/@content\"], 'pubdate': [\"//meta[contains(@name,'date')]/@content\", \"//span[contains(@class,'date')]/text()\", \"//p[contains(@class,'date')]/text()\", \"//div[contains(@id,'date')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//div[contains(@id,'mage')]/img//@src\", \"//div[contains(@class,'Image')]//@href\"], 'title': [\"//div[contains(@class,'MainTopics')]/h2[contains(@class,'NewsTitle')]/a/text()\"], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'Text')]/p//text()\", \"//div[contains(@class,'ext')]/p//text()\", \"//div[contains(@class,'cbox-content row')]/p//text()\", \"//p[contains(@id,'ews')]/text()\", \"//div[contains(@class,'ews')]/p//text()\", \"//meta[contains(@name,'content_encoded')]/@content\", \"//p[contains(@class,'lead-def')]/text()\", \"//span[contains(@id,'article')]/p/text()\", \"//div[contains(@class,'article')]//p//text()\", \"//div[contains(@class,'content')]/p//text()\", \"//div[contains(@id,'rticle')]/p//text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}" WHERE `Site_Id`="130aadc544a6a69d92524166afc41745" AND `Name`="template"
73 # UPDATE `sites_properties` SET `Value`="{'description': [\"//meta[contains(@name,'description')]/@content\"], 'pubdate': [\"//meta[contains(@name,'date')]/@content\", \"//span[contains(@class,'date')]/text()\", \"//p[contains(@class,'date')]/text()\", \"//div[contains(@id,'date')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//div[contains(@class,'NewsBody')]/div[contains(@class,'TopPhoto')]/a/img/@src\"], 'title': [\"//div[contains(@class,'MainTopics')]/h2[contains(@class,'NewsTitle')]/a/text()\"], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'Text')]/p//text()\", \"//div[contains(@class,'ext')]/p//text()\", \"//div[contains(@class,'cbox-content row')]/p//text()\", \"//p[contains(@id,'ews')]/text()\", \"//div[contains(@class,'ews')]/p//text()\", \"//meta[contains(@name,'content_encoded')]/@content\", \"//p[contains(@class,'lead-def')]/text()\", \"//span[contains(@id,'article')]/p/text()\", \"//div[contains(@class,'article')]//p//text()\", \"//div[contains(@class,'content')]/p//text()\", \"//div[contains(@id,'rticle')]/p//text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}" WHERE `Site_Id`="130aadc544a6a69d92524166afc41745" AND `Name`="template"
74 # UPDATE `sites_properties` SET `Value`="{'description': [\"//meta[contains(@name,'description')]/@content\"], 'pubdate': [\"//meta[contains(@name,'date')]/@content\", \"//span[contains(@class,'date')]/text()\", \"//p[contains(@class,'date')]/text()\", \"//div[contains(@id,'date')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//div[contains(@class,'NewsBody')]/div[contains(@class,'TopPhoto')]/a/img/@src\"], 'title': [\"//div[contains(@class,'MainTopics')]/h2[contains(@class,'NewsTitle')]/a/text()\"], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'NewsBody')]/p/text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}" WHERE `Site_Id`="130aadc544a6a69d92524166afc41745" AND `Name`="template"
75 # UPDATE `sites_properties` SET `Value`="{'description': [\"//meta[contains(@name,'description')]/@content\"], 'pubdate': [\"//meta[contains(@name,'date')]/@content\", \"//span[contains(@class,'date')]/text()\", \"//p[contains(@class,'date')]/text()\", \"//div[contains(@id,'date')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//div[contains(@class,'NewsBody')]/div[contains(@class,'TopPhoto')]/a/img/@src\"], 'title': [\"//div[contains(@class,'MainTopics')]/h2[contains(@class,'NewsTitle')]/a/text()\", \"//meta[contains(@property,'og:title')]/@content\"], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'NewsBody')]/p/text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}" WHERE `Site_Id`="130aadc544a6a69d92524166afc41745" AND `Name`="template"
76 
# Batch item 4: a sankei.jp.msn.com politics article.
# siteId4 is the MD5 hex digest of the site root URL and urlId4 the MD5 hex
# digest of the full article URL; both are produced in one pass and unpacked
# in that order.
siteId4, urlId4 = (str(md5.new(address).hexdigest()) for address in (
    "http://sankei.jp.msn.com",
    "http://sankei.jp.msn.com/politics/news/140401/stt14040112470002-n1.htm",
))
bItem4 = BatchItem(siteId4, urlId4)
80 # insert into `urls_ab3b37f575d82d74220865746e4778df`(`Site_Id`, `URLMd5`, `URL`) values("ab3b37f575d82d74220865746e4778df","276c2fdeba9e403290ff7fb0843ae088", "http://sankei.jp.msn.com/politics/news/140401/stt14040112470002-n1.htm")
81 # INSERT INTO `sites_properties`(`Site_Id`, `Name`, `Value`, `UDate`, `CDate`) VALUES ("ab3b37f575d82d74220865746e4778df","template","{'description': [\"//meta[contains(@name,'description')]/@content\"], 'pubdate': [\"//meta[contains(@name,'date')]/@content\", \"//span[contains(@class,'date')]/text()\", \"//p[contains(@class,'date')]/text()\", \"//div[contains(@id,'date')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//div[contains(@id,'mage')]/img//@src\", \"//div[contains(@class,'Image')]//@href\"], 'title': [\"//meta[contains(@property,'title')]/@content\", \"//meta[contains(@name,'title')]/@content\", '//title/text()'], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'Text')]/p//text()\", \"//div[contains(@class,'ext')]/p//text()\", \"//div[contains(@class,'cbox-content row')]/p//text()\", \"//p[contains(@id,'ews')]/text()\", \"//div[contains(@class,'ews')]/p//text()\", \"//meta[contains(@name,'content_encoded')]/@content\", \"//p[contains(@class,'lead-def')]/text()\", \"//span[contains(@id,'article')]/p/text()\", \"//div[contains(@class,'article')]//p//text()\", \"//div[contains(@class,'content')]/p//text()\", \"//div[contains(@id,'rticle')]/p//text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}",NOW(),NOW())
82 # INSERT INTO `dc_sites`.`sites` (`Id`, `UDate`, `TcDate`, `CDate`, `Resources`, `Iterations`, `State`, `Priority`, `MaxURLs`, `MaxResources`, `MaxErrors`, `MaxResourceSize`, `RequestDelay`, `HTTPTimeout`, `ErrorMask`, `Errors`, `URLType`) VALUES ('ab3b37f575d82d74220865746e4778df', NULL, NULL, '2014-04-16 14:09:06', '0', '0', '1', '0', '0', '0', '0', '0', '0', '30000', '0', '0', '0');
83 # UPDATE `sites_properties` SET `Value`="{'description': [\"//meta[contains(@name,'description')]/@content\"], 'pubdate': [\"//meta[contains(@name,'date')]/@content\", \"//span[contains(@class,'date')]/text()\", \"//p[contains(@class,'date')]/text()\", \"//div[contains(@id,'date')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//span[contains(@id,'__r_article_img__')]/img/@src\"], 'title': [\"//meta[contains(@property,'title')]/@content\", \"//meta[contains(@name,'title')]/@content\", '//title/text()'], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'Text')]/p//text()\", \"//div[contains(@class,'ext')]/p//text()\", \"//div[contains(@class,'cbox-content row')]/p//text()\", \"//p[contains(@id,'ews')]/text()\", \"//div[contains(@class,'ews')]/p//text()\", \"//meta[contains(@name,'content_encoded')]/@content\", \"//p[contains(@class,'lead-def')]/text()\", \"//span[contains(@id,'article')]/p/text()\", \"//div[contains(@class,'article')]//p//text()\", \"//div[contains(@class,'content')]/p//text()\", \"//div[contains(@id,'rticle')]/p//text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}" WHERE `Site_Id`="ab3b37f575d82d74220865746e4778df" AND `Name`="template"
84 # UPDATE `sites_properties` SET `Value`="{'description': [\"//div[contains(@class,'NewsTextFull')]/p[1]/text()\"], 'pubdate': [\"//meta[contains(@name,'date')]/@content\", \"//span[contains(@class,'date')]/text()\", \"//p[contains(@class,'date')]/text()\", \"//div[contains(@id,'date')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//span[contains(@id,'__r_article_img__')]/img/@src\"], 'title': [\"//h2[contains(@id,'NewsTitle')]//text()\"], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'NewsTextFull')]/p/text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}" WHERE `Site_Id`="ab3b37f575d82d74220865746e4778df" AND `Name`="template"
85 # UPDATE `sites_properties` SET `Value`="{'description': [\"//div[contains(@class,'NewsTextFull')]/p[1]/text()\"], 'pubdate': [\"//meta[contains(@name,'date')]/@content\", \"//span[contains(@class,'date')]/text()\", \"//p[contains(@class,'date')]/text()\", \"//div[contains(@id,'date')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//span[contains(@id,'__r_article_img__')]/img/@src\"], 'title': [\"//h2[contains(@id,'NewsTitle')]//text()\"], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@property,'og:url')]/@content\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'NewsTextFull')]/p/text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}" WHERE `Site_Id`="ab3b37f575d82d74220865746e4778df" AND `Name`="template"
86 # UPDATE `sites_properties` SET `Value`="{'description': [\"//div[contains(@class,'NewsText')]/p[1]/text()\", \"//meta[contains(@name,'description')]/@content\"], 'pubdate': [\"//meta[contains(@name,'date')]/@content\", \"//span[contains(@class,'date')]/text()\", \"//p[contains(@class,'date')]/text()\", \"//div[contains(@id,'date')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//span[contains(@id,'__r_article_img__')]/img/@src\"], 'title': [\"//h2[contains(@id,'NewsTitle')]//text()\"], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@property,'og:url')]/@content\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'NewsText')]/p/text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}" WHERE `Site_Id`="ab3b37f575d82d74220865746e4778df" AND `Name`="template"
87 
# jiji.com test item: the site id and the URL id are the MD5 hex digests of
# the site root URL and of the article URL respectively.
siteId5 = md5.new("http://www.jiji.com").hexdigest()
urlId5 = md5.new("http://www.jiji.com/jc/zc?k=201403/2014033000221&rel=j&g=int&relid=1_1").hexdigest()
bItem5 = BatchItem(siteId5, urlId5)
91 # insert into `urls_5dfd57388380f1b8ece0bb133b9a61f7`(`Site_Id`, `URLMd5`, `URL`) values("5dfd57388380f1b8ece0bb133b9a61f7","0f67397c8374bd8a3597b451b6960e22", "http://www.jiji.com/jc/zc?k=201403/2014033000221&rel=j&g=int&relid=1_1")
92 # INSERT INTO `sites_properties`(`Site_Id`, `Name`, `Value`, `UDate`, `CDate`) VALUES ("5dfd57388380f1b8ece0bb133b9a61f7","template","{'description': [\"//meta[contains(@name,'description')]/@content\"], 'pubdate': [\"//meta[contains(@name,'date')]/@content\", \"//span[contains(@class,'date')]/text()\", \"//p[contains(@class,'date')]/text()\", \"//div[contains(@id,'date')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//div[contains(@id,'mage')]/img//@src\", \"//div[contains(@class,'Image')]//@href\"], 'title': [\"//meta[contains(@property,'title')]/@content\", \"//meta[contains(@name,'title')]/@content\", '//title/text()'], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'Text')]/p//text()\", \"//div[contains(@class,'ext')]/p//text()\", \"//div[contains(@class,'cbox-content row')]/p//text()\", \"//p[contains(@id,'ews')]/text()\", \"//div[contains(@class,'ews')]/p//text()\", \"//meta[contains(@name,'content_encoded')]/@content\", \"//p[contains(@class,'lead-def')]/text()\", \"//span[contains(@id,'article')]/p/text()\", \"//div[contains(@class,'article')]//p//text()\", \"//div[contains(@class,'content')]/p//text()\", \"//div[contains(@id,'rticle')]/p//text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}",NOW(),NOW())
93 # INSERT INTO `dc_sites`.`sites` (`Id`, `UDate`, `TcDate`, `CDate`, `Resources`, `Iterations`, `State`, `Priority`, `MaxURLs`, `MaxResources`, `MaxErrors`, `MaxResourceSize`, `RequestDelay`, `HTTPTimeout`, `ErrorMask`, `Errors`, `URLType`) VALUES ('5dfd57388380f1b8ece0bb133b9a61f7', NULL, NULL, '2014-04-16 14:09:06', '0', '0', '1', '0', '0', '0', '0', '0', '0', '30000', '0', '0', '0');
94 # UPDATE `sites_properties` SET `Value`="{'description': [\"//div[contains(@id,'article-body')]/p[1]/text()\"], 'pubdate': [\"//span[contains(@id,'pub_time')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//div[contains(@id,'article-body')]/figure/div[contains(@class,'figure')]/a/img/@src\"], 'title': [\"//h1[contains(@id,'article-title')]/text()\"], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'Text')]/p//text()\", \"//div[contains(@class,'ext')]/p//text()\", \"//div[contains(@class,'cbox-content row')]/p//text()\", \"//p[contains(@id,'ews')]/text()\", \"//div[contains(@class,'ews')]/p//text()\", \"//meta[contains(@name,'content_encoded')]/@content\", \"//p[contains(@class,'lead-def')]/text()\", \"//span[contains(@id,'article')]/p/text()\", \"//div[contains(@class,'article')]//p//text()\", \"//div[contains(@class,'content')]/p//text()\", \"//div[contains(@id,'rticle')]/p//text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}" WHERE `Site_Id`="5dfd57388380f1b8ece0bb133b9a61f7" AND `Name`="template"
95 # INSERT INTO `sites_properties`(`Site_Id`, `Name`, `Value`, `UDate`, `CDate`) VALUES ("5dfd57388380f1b8ece0bb133b9a61f7","HTTP_HEADERS","User-Agent: Googlebot",NOW(),NOW())
96 
# kyodo.co.jp test item: the site id and the URL id are the MD5 hex digests
# of the site root URL and of the article URL respectively.
siteId6 = md5.new("http://www.kyodo.co.jp").hexdigest()
urlId6 = md5.new("http://www.kyodo.co.jp/release-news/2014-04-07_522084/").hexdigest()
bItem6 = BatchItem(siteId6, urlId6)
100 # insert into `urls_e1d2866893129ea29bd1721ec649d64d`(`Site_Id`, `URLMd5`, `URL`) values("e1d2866893129ea29bd1721ec649d64d","d8523df36ea281d193fa07cb4f8f2676", "http://www.kyodo.co.jp/release-news/2014-04-07_522084/")
101 # INSERT INTO `sites_properties`(`Site_Id`, `Name`, `Value`, `UDate`, `CDate`) VALUES ("e1d2866893129ea29bd1721ec649d64d","template","{'description': [\"//meta[contains(@name,'description')]/@content\"], 'pubdate': [\"//meta[contains(@name,'date')]/@content\", \"//span[contains(@class,'date')]/text()\", \"//p[contains(@class,'date')]/text()\", \"//div[contains(@id,'date')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//div[contains(@id,'mage')]/img//@src\", \"//div[contains(@class,'Image')]//@href\"], 'title': [\"//meta[contains(@property,'title')]/@content\", \"//meta[contains(@name,'title')]/@content\", '//title/text()'], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'Text')]/p//text()\", \"//div[contains(@class,'ext')]/p//text()\", \"//div[contains(@class,'cbox-content row')]/p//text()\", \"//p[contains(@id,'ews')]/text()\", \"//div[contains(@class,'ews')]/p//text()\", \"//meta[contains(@name,'content_encoded')]/@content\", \"//p[contains(@class,'lead-def')]/text()\", \"//span[contains(@id,'article')]/p/text()\", \"//div[contains(@class,'article')]//p//text()\", \"//div[contains(@class,'content')]/p//text()\", \"//div[contains(@id,'rticle')]/p//text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}",NOW(),NOW())
102 # INSERT INTO `dc_sites`.`sites` (`Id`, `UDate`, `TcDate`, `CDate`, `Resources`, `Iterations`, `State`, `Priority`, `MaxURLs`, `MaxResources`, `MaxErrors`, `MaxResourceSize`, `RequestDelay`, `HTTPTimeout`, `ErrorMask`, `Errors`, `URLType`) VALUES ('e1d2866893129ea29bd1721ec649d64d', NULL, NULL, '2014-04-16 14:09:06', '0', '0', '1', '0', '0', '0', '0', '0', '0', '30000', '0', '0', '0');
103 # UPDATE `sites_properties` SET `Value`="{'description': [\"//div[contains(@class,'entry-content')]/p[1]/text()\"], 'pubdate': [\"//div[contains(@class,'entry-meta')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//meta[contains(@property,'og:image')]/@content\"], 'title': [\"//h1[contains(@class,'entry-title')]/text()\"], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'entry-content')]/p/text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}" WHERE `Site_Id`="e1d2866893129ea29bd1721ec649d64d" AND `Name`="template"
104 # UPDATE `sites_properties` SET `Value`="{'description': [\"//div[contains(@class,'entry-content')]/p[1]/text()\"], 'pubdate': [\"//div[contains(@class,'entry-meta')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//meta[contains(@property,'og:image')]/@content\"], 'title': [\"//h1[contains(@class,'entry-title')]/text()\"], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@id,'Main')]//div[contains(@class,'entry-content')]/p/text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}" WHERE `Site_Id`="e1d2866893129ea29bd1721ec649d64d" AND `Name`="template"
105 
# www3.nhk.or.jp test item: the site id and the URL id are the MD5 hex
# digests of the site root URL and of the page URL respectively.
siteId7 = md5.new("http://www3.nhk.or.jp").hexdigest()
urlId7 = md5.new("http://www3.nhk.or.jp/chihouhatsu/").hexdigest()
bItem7 = BatchItem(siteId7, urlId7)
109 # insert into `urls_e1e17e17ff30feab8bfcfc31109fdd0e`(`Site_Id`, `URLMd5`, `URL`) values("e1e17e17ff30feab8bfcfc31109fdd0e","8562cdf2e16e9a48fec084b8b52fc9c8", "http://www4.nhk.or.jp/chihouhatsu/")
110 # INSERT INTO `sites_properties`(`Site_Id`, `Name`, `Value`, `UDate`, `CDate`) VALUES ("5603d05e159f771dd467e1173137f556","template","{'description': [\"//meta[contains(@name,'description')]/@content\"], 'pubdate': [\"//meta[contains(@name,'date')]/@content\", \"//span[contains(@class,'date')]/text()\", \"//p[contains(@class,'date')]/text()\", \"//div[contains(@id,'date')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//div[contains(@id,'mage')]/img//@src\", \"//div[contains(@class,'Image')]//@href\"], 'title': [\"//meta[contains(@property,'title')]/@content\", \"//meta[contains(@name,'title')]/@content\", '//title/text()'], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'Text')]/p//text()\", \"//div[contains(@class,'ext')]/p//text()\", \"//div[contains(@class,'cbox-content row')]/p//text()\", \"//p[contains(@id,'ews')]/text()\", \"//div[contains(@class,'ews')]/p//text()\", \"//meta[contains(@name,'content_encoded')]/@content\", \"//p[contains(@class,'lead-def')]/text()\", \"//span[contains(@id,'article')]/p/text()\", \"//div[contains(@class,'article')]//p//text()\", \"//div[contains(@class,'content')]/p//text()\", \"//div[contains(@id,'rticle')]/p//text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}",NOW(),NOW())
111 # INSERT INTO `dc_sites`.`sites` (`Id`, `UDate`, `TcDate`, `CDate`, `Resources`, `Iterations`, `State`, `Priority`, `MaxURLs`, `MaxResources`, `MaxErrors`, `MaxResourceSize`, `RequestDelay`, `HTTPTimeout`, `ErrorMask`, `Errors`, `URLType`) VALUES ('5603d05e159f771dd467e1173137f556', NULL, NULL, '2014-04-16 14:09:06', '0', '0', '1', '0', '0', '0', '0', '0', '0', '30000', '0', '0', '0');
112 # UPDATE `sites_properties` SET `Value`="{'description': [\"//p[contains(@id,'news_textbody')]/text()\"], 'pubdate': [\"//span[contains(@id,'news_date')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//div[contains(@id,'mage')]/img//@src\", \"//div[contains(@class,'Image')]//@href\"], 'title': [\"//h1[contains(@class,'entryTitle')]/span[contains(@class,'contentTitle')]/text()\"], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//p[contains(@id,'news_textmore')]/text() | //p[contains(@id,'news_textbody')]/text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}" WHERE `Site_Id`="e1e17e17ff30feab8bfcfc31109fdd0e" AND `Name`="template"
113 # UPDATE `sites_properties` SET `Value`="{'description': [\"//p[contains(@id,'news_textbody')]/text()\"], 'pubdate': [\"concat(//span[contains(@id,'news_date')]/text(), //span[contains(@id,'news_time')]/text())\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//div[contains(@id,'mage')]/img//@src\", \"//div[contains(@class,'Image')]//@href\"], 'title': [\"//h1[contains(@class,'entryTitle')]/span[contains(@class,'contentTitle')]/text()\"], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//p[contains(@id,'news_textmore')]/text() | //p[contains(@id,'news_textbody')]/text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}" WHERE `Site_Id`="e1e17e17ff30feab8bfcfc31109fdd0e" AND `Name`="template"
114 
# jp.reuters.com test item: the site id and the URL id are the MD5 hex
# digests of the site root URL and of the article URL respectively.
siteId8 = md5.new("http://jp.reuters.com").hexdigest()
urlId8 = md5.new("http://jp.reuters.com/article/topNews/idJPTYEA3700120140408").hexdigest()
bItem8 = BatchItem(siteId8, urlId8)
118 # insert into `urls_f84bb45b91e15bc669d4e1456e03175d`(`Site_Id`, `URLMd5`, `URL`) values("f84bb45b91e15bc669d4e1456e03175d","c5ff32f6e027974e6eb2566226fed3bb", "http://jp.reuters.com/article/topNews/idJPTYEA3700120140408")
119 # INSERT INTO `sites_properties`(`Site_Id`, `Name`, `Value`, `UDate`, `CDate`) VALUES ("f84bb45b91e15bc669d4e1456e03175d","template","{'description': [\"//meta[contains(@name,'description')]/@content\"], 'pubdate': [\"//meta[contains(@name,'date')]/@content\", \"//span[contains(@class,'date')]/text()\", \"//p[contains(@class,'date')]/text()\", \"//div[contains(@id,'date')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//div[contains(@id,'mage')]/img//@src\", \"//div[contains(@class,'Image')]//@href\"], 'title': [\"//meta[contains(@property,'title')]/@content\", \"//meta[contains(@name,'title')]/@content\", '//title/text()'], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'Text')]/p//text()\", \"//div[contains(@class,'ext')]/p//text()\", \"//div[contains(@class,'cbox-content row')]/p//text()\", \"//p[contains(@id,'ews')]/text()\", \"//div[contains(@class,'ews')]/p//text()\", \"//meta[contains(@name,'content_encoded')]/@content\", \"//p[contains(@class,'lead-def')]/text()\", \"//span[contains(@id,'article')]/p/text()\", \"//div[contains(@class,'article')]//p//text()\", \"//div[contains(@class,'content')]/p//text()\", \"//div[contains(@id,'rticle')]/p//text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}",NOW(),NOW())
120 # INSERT INTO `dc_sites`.`sites` (`Id`, `UDate`, `TcDate`, `CDate`, `Resources`, `Iterations`, `State`, `Priority`, `MaxURLs`, `MaxResources`, `MaxErrors`, `MaxResourceSize`, `RequestDelay`, `HTTPTimeout`, `ErrorMask`, `Errors`, `URLType`) VALUES ('f84bb45b91e15bc669d4e1456e03175d', NULL, NULL, '2014-04-16 14:09:06', '0', '0', '1', '0', '0', '0', '0', '0', '0', '30000', '0', '0', '0');
121 # UPDATE `sites_properties` SET `Value`="{'description': [\"///span[contains(@class,'focusParagraph')]/p/text()\"], 'pubdate': [\"//div[contains(@class,'timestampHeader')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//td[contains(@id,'articlePhoto')]//img/@src\"], 'title': [\"///h1/text()\"], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@id,'resizeableText')]//p/text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}" WHERE `Site_Id`="f84bb45b91e15bc669d4e1456e03175d" AND `Name`="template"
122 
# nikkei.com test item: the site id and the URL id are the MD5 hex digests
# of the site root URL and of the article URL respectively.
siteId9 = md5.new("http://www.nikkei.com").hexdigest()
urlId9 = md5.new("http://www.nikkei.com/article/DGXNASGM0703U_Y4A400C1MM0000/?dg=1").hexdigest()
bItem9 = BatchItem(siteId9, urlId9)
126 # insert into `urls_bdb030e601458942a940a0891e2ed239`(`Site_Id`, `URLMd5`, `URL`) values("bdb030e601458942a940a0891e2ed239","6fb22e58d9a49cf58df045266bb97003", "http://www.nikkei.com/article/DGXNASGM0703U_Y4A400C1MM0000/?dg=1")
127 # INSERT INTO `sites_properties`(`Site_Id`, `Name`, `Value`, `UDate`, `CDate`) VALUES ("bdb030e601458942a940a0891e2ed239","template","{'description': [\"//meta[contains(@name,'description')]/@content\"], 'pubdate': [\"//meta[contains(@name,'date')]/@content\", \"//span[contains(@class,'date')]/text()\", \"//p[contains(@class,'date')]/text()\", \"//div[contains(@id,'date')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//div[contains(@id,'mage')]/img//@src\", \"//div[contains(@class,'Image')]//@href\"], 'title': [\"//meta[contains(@property,'title')]/@content\", \"//meta[contains(@name,'title')]/@content\", '//title/text()'], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'Text')]/p//text()\", \"//div[contains(@class,'ext')]/p//text()\", \"//div[contains(@class,'cbox-content row')]/p//text()\", \"//p[contains(@id,'ews')]/text()\", \"//div[contains(@class,'ews')]/p//text()\", \"//meta[contains(@name,'content_encoded')]/@content\", \"//p[contains(@class,'lead-def')]/text()\", \"//span[contains(@id,'article')]/p/text()\", \"//div[contains(@class,'article')]//p//text()\", \"//div[contains(@class,'content')]/p//text()\", \"//div[contains(@id,'rticle')]/p//text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}",NOW(),NOW())
128 # INSERT INTO `dc_sites`.`sites` (`Id`, `UDate`, `TcDate`, `CDate`, `Resources`, `Iterations`, `State`, `Priority`, `MaxURLs`, `MaxResources`, `MaxErrors`, `MaxResourceSize`, `RequestDelay`, `HTTPTimeout`, `ErrorMask`, `Errors`, `URLType`) VALUES ('bdb030e601458942a940a0891e2ed239', NULL, NULL, '2014-04-16 14:09:06', '0', '0', '1', '0', '0', '0', '0', '0', '0', '30000', '0', '0', '0');
129 # UPDATE `sites_properties` SET `Value`="{'description': [\"//div[contains(@class,'cmn-article_text JSID_key_fonttxt')]/p/text()\"], 'pubdate': [\"//dl[contains(@class,'cmn-article_status cmn-clearfix')]/dd[contains(@class,'cmnc-publish')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//div[contains(@class,'cmnc-figure')]//img/@src\"], 'title': [\"//h4[contains(@class,'cmn-article_title cmn-clearfi')]/span[contains(@class,'mnc-middle JSID_key_fonthln')]/text()\"], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'cmn-article_text JSID_key_fonttxt')]//p/text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}" WHERE `Site_Id`="bdb030e601458942a940a0891e2ed239" AND `Name`="template"
130 # UPDATE `sites_properties` SET `Value`="{'description': [\"//div[contains(@class,'cmn-article_text JSID_key_fonttxt')]/p/text()\"], 'pubdate': [\"//dl[contains(@class,'cmn-article_status cmn-clearfix')]/dd[contains(@class,'cmnc-publish')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//div[contains(@class,'cmnc-figure')]//img/@src\"], 'title': [\"//h4[contains(@class,'cmn-article_title cmn-clearfi')]/span[contains(@class,'mnc-middle JSID_key_fonthln')]/text()\", \"//h2[contains(@id,'JSID_title')]//text()\"], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'cmn-article_text JSID_key_fonttxt')]//p/text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}" WHERE `Site_Id`="bdb030e601458942a940a0891e2ed239" AND `Name`="template"
# tokyo-np.co.jp test item: the site id and the URL id are the MD5 hex
# digests of the site root URL and of the article URL respectively.
siteId10 = md5.new("http://www.tokyo-np.co.jp").hexdigest()
urlId10 = md5.new("http://www.tokyo-np.co.jp/s/article/2014042290135558.html").hexdigest()
bItem10 = BatchItem(siteId10, urlId10)
134 # insert into `urls_13d19ef5c14d00d9c550692d5ea8d0af`(`Site_Id`, `URLMd5`, `URL`) values("13d19ef5c14d00d9c550692d5ea8d0af","068eaac8720666ec2bc7943e922a318b", "http://www.tokyo-np.co.jp/s/article/2014042290135558.html")
135 # INSERT INTO `sites_properties`(`Site_Id`, `Name`, `Value`, `UDate`, `CDate`) VALUES ("13d19ef5c14d00d9c550692d5ea8d0af","template","{'description': [\"//meta[contains(@name,'description')]/@content\"], 'pubdate': [\"//meta[contains(@name,'date')]/@content\", \"//span[contains(@class,'date')]/text()\", \"//p[contains(@class,'date')]/text()\", \"//div[contains(@id,'date')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//div[contains(@id,'mage')]/img//@src\", \"//div[contains(@class,'Image')]//@href\"], 'title': [\"//meta[contains(@property,'title')]/@content\", \"//meta[contains(@name,'title')]/@content\", '//title/text()'], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'Text')]/p//text()\", \"//div[contains(@class,'ext')]/p//text()\", \"//div[contains(@class,'cbox-content row')]/p//text()\", \"//p[contains(@id,'ews')]/text()\", \"//div[contains(@class,'ews')]/p//text()\", \"//meta[contains(@name,'content_encoded')]/@content\", \"//p[contains(@class,'lead-def')]/text()\", \"//span[contains(@id,'article')]/p/text()\", \"//div[contains(@class,'article')]//p//text()\", \"//div[contains(@class,'content')]/p//text()\", \"//div[contains(@id,'rticle')]/p//text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}",NOW(),NOW())
136 # INSERT INTO `dc_sites`.`sites` (`Id`, `UDate`, `TcDate`, `CDate`, `Resources`, `Iterations`, `State`, `Priority`, `MaxURLs`, `MaxResources`, `MaxErrors`, `MaxResourceSize`, `RequestDelay`, `HTTPTimeout`, `ErrorMask`, `Errors`, `URLType`) VALUES ('13d19ef5c14d00d9c550692d5ea8d0af', NULL, NULL, '2014-04-16 14:09:06', '0', '0', '1', '0', '0', '0', '0', '0', '0', '30000', '0', '0', '0');
137 # UPDATE `sites_properties` SET `Value`="{'description': [\"//div[contains(@class,'Text')]//p[1]/text()\"], 'pubdate': [\"//div[contains(@class,'News-headarea')]/p[contains(@class,'data')]/text()\"], 'author': [\"//meta[contains(@name,'author')]/@content\"], 'media': [\"//table[contains(@class,'Photo-undefine')]//img/@src\"], 'title': [\"//div[contains(@class,'News-headarea')]/h1/text()\"], 'media_thumbnail': [\"//meta[contains(@name,'media_thumbnail')]/@content\"], 'media_content': [\"//meta[contains(@name,'media_content')]/@content\"], 'dc_date': [\"//meta[contains(@name,'date')]/@content\"], 'link': [\"//meta[contains(@name,'link')]/@content\", \"//li[contains(@class,'space')]/@href\", \"//link[contains(@rel,'canonical')]/@href\"], 'keywords': [\"//meta[contains(@name,'keywords')]/@content\"], 'content_encoded': [\"//div[contains(@class,'Text')]//p/text()\"], 'guid': [\"//meta[contains(@name,'guid')]/@content\"], 'enclosure': [\"//meta[contains(@name,'enclosure')]/@content\"]}" WHERE `Site_Id`="13d19ef5c14d00d9c550692d5ea8d0af" AND `Name`="template"
138 
# Every prepared batch item, in order; this list is what the __main__ section
# packs into the Batch sent to the processor task.
url_list = [
    bItem1, bItem2, bItem3, bItem4, bItem5,
    bItem6, bItem7, bItem8, bItem9, bItem10,
]
151 
152 
# Shell fragment that moves into the directory holding the task scripts; it is
# prepended to the command line built in processFullBatch().
PWD = "cd ../../bin"
# Interpreter used to launch the task script.
PYTHON_BINARY = "/usr/bin/python"

# Crawler task script and its config option.
CRAWLER_TASK_BINARY = "./crawler-task.py"
CRAWLER_TASK_CFG = "--config=../ini/crawler-task.ini"

# Processor task script and its config option (the task exercised by this test).
PROCESSOR_TASK_BINARY = "./processor-task.py"
PROCESSOR_TASK_CFG = "--config=../ini/processor-task.ini"

# Helper scripts; referenced only by the commented-out debug pipeline in
# processFullBatch().
PREPAIRER = "./prepairer.py"
JSON_VIEWER = "./scraper_json_viewer.py"

# Outcome of one task run: process exit code plus the captured stdout/stderr.
Results = namedtuple("Results", "exit_code, output, err")
163 
164 
def processFullBatch(input_object):
    """
    Run the processor task on a pickled object and collect the outcome.

    The object is pickled and written to the stdin of the processor-task
    script, which is launched through the shell after changing into the
    directory selected by PWD. The captured stdout is echoed to this
    process' stdout, and the captured stderr (if any) to its stderr.

    @param input_object: picklable object to send to the task
    @return: Results(exit_code, output, err) holding the child's exit code,
             its raw stdout and its raw stderr
    """
    import sys  # local import: only needed to relay the child's stderr

    input_pickled_object = pickle.dumps(input_object)
    # Debug variant of the pipeline, kept for reference:
    # cmd = PWD + " && " + PREPAIRER + " | " + PYTHON_BINARY + " " + PROCESSOR_TASK_BINARY + " " + PROCESSOR_TASK_CFG + " | " + JSON_VIEWER
    cmd = " ".join((PWD, "&&", PYTHON_BINARY, PROCESSOR_TASK_BINARY, PROCESSOR_TASK_CFG))
    # stderr must be piped as well, otherwise communicate() returns None for
    # it and the `err` field of Results can never carry anything.
    process = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE, shell=True)
    (output, err) = process.communicate(input=input_pickled_object)
    print(output)
    if err:
        # Keep the child's diagnostics visible now that they are captured.
        sys.stderr.write(err)
    # communicate() has already waited for the child, so returncode is set.
    return Results(process.returncode, output, err)
176 
177 
if __name__ == "__main__":
    # Pack the prepared items into one Batch.
    # NOTE(review): the first argument (11) is presumably the batch id;
    # confirm against dc.EventObjects.Batch.
    input_object = Batch(11, url_list)
    # TODO main work
    result = processFullBatch(input_object)
    # Show the raw task output, then decode it back into the response object.
    # pylint: disable=E1103
    print(result.output)
    # The payload comes from the processor task started above, and is
    # unpickled as-is.
    generalResponse = pickle.loads(result.output)
    # Final status check, currently disabled:
    # assert generalResponse.errorCode == GeneralResponse.ERROR_OK