HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
ftest_getSentencesString.py
Go to the documentation of this file.
1 #coding: utf-8
2 
3 import re
4 from string import punctuation
5 
6 
def getWordsCount(string, method=0):
    r"""Count the words in a string using one of several strategies.

    Args:
        string: Text whose words are counted.
        method: Counting strategy.
            0 (default) - replace every ASCII punctuation character with a
                space, then count the whitespace-separated tokens.
            1 - count the matches of the regular expression ``\w+``.
            any other value - count the whitespace-separated tokens as is.

    Returns:
        The number of words found; 0 for an empty or blank string.
    """
    if method == 0:
        # re.escape keeps every punctuation character literal inside the
        # character class. Without it the backslash merely escapes the
        # following "]" and is itself never treated as punctuation.
        punctuationRe = re.compile('[' + re.escape(punctuation) + ']')
        ret = len(punctuationRe.sub(' ', string).split())
    elif method == 1:
        ret = len(re.findall(r'\w+', string))
    else:
        ret = len(string.split())

    return ret
23 
24 
25 
def getSentencesString(tagValue, maxSentences=1, maxWordsTotal=0):
    """Return the leading sentences of a text, optionally capped by words.

    Args:
        tagValue: Source text.
        maxSentences: Number of sentences to keep. A sentence ends at the
            next '.', '!' or '?', whichever comes first. Values below 1
            behave like 1.
        maxWordsTotal: When greater than 0 and the kept text holds more
            words than this (as counted by getWordsCount), the text is cut
            just before its maxWordsTotal-th space, or before the last
            usable space when there are fewer. 0 disables the word limit.

    Returns:
        The truncated text. The text is returned unchanged when it holds
        fewer than maxSentences sentence delimiters, and the word limit
        leaves it untouched when there is no space to cut at.
    """
    ret = tagValue

    sDelimChars = ('.', '!', '?')
    entrances = 0
    # Walk the text once so that delimiters are counted in the order they
    # appear, regardless of which of the three characters they are.
    # The scan starts at index 1, so a delimiter in the very first position
    # is never counted as a sentence end.
    for pos in range(1, len(tagValue)):
        if tagValue[pos] in sDelimChars:
            entrances += 1
            if entrances >= maxSentences:
                ret = tagValue[:pos + 1]
                break

    if maxWordsTotal > 0 and getWordsCount(ret) > maxWordsTotal:
        # -1 means "no cut position found"; the text is then kept as is
        # instead of being truncated to an empty string.
        cutPos = -1
        wc = 0
        pos = ret.find(' ', 1)
        # A space in the last position is not a usable cut point.
        while pos != -1 and wc < maxWordsTotal and (pos + 1) < len(ret):
            wc += 1
            cutPos = pos
            pos = ret.find(' ', pos + 1)
        if cutPos != -1:
            ret = ret[:cutPos]

    return ret
66 
# Functional check: run getSentencesString over edge-case inputs (empty,
# blank, delimiter-only, leading delimiter, no delimiter at all) and print
# each input in braces followed by the result in brackets.
ss = [
    "",
    " ",
    ".",
    "..",
    ". .",
    " . .",
    "The test sentence1. The sentence2. The sentence 3..",
    ".The test sentence1. The sentence2. The sentence 3..",
    " . The test sentence1. The sentence2. The sentence 3..",
    "Thetestsentence1Thesentence2Thesentence",
]
for s in ss:
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print('{' + s + '}')
    print('[' + getSentencesString(s, 2, 4) + ']')
    print('--------------')
82 
def getSentencesString(tagValue, maxSentences=1, maxWordsTotal=0)
Return the leading sentences of the content, optionally limited by a total word count.
def getWordsCount(string, method=0)
Count the words in a string using one of several methods.