highlighter application  1.1
HCE project utils : highlighter
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
HighlightingAlgorithmRefine.cpp
Go to the documentation of this file.
1 
14 #include "Defs.hpp"
16 
17 // cppcheck-suppress unusedFunction
18 void HighlightingAlgorithmRefine::setWordPositionMap(std::map<unsigned int, std::pair<unsigned long long, WordPos> > &_wordPositionMap)
19 {
20  wordPositionMap = _wordPositionMap;
21 }
22 
23 void HighlightingAlgorithmRefine::fillWordOffsetMap(Poco::SharedPtr<HCE::OutDataRefine> outData, SearchType searchType,
24  std::map<unsigned long long, std::vector<WordPos> > &wordMapPtr, std::map<unsigned int, WordPos> &wordOffsetMap)
25 {
26  if(!outData.isNull())
27  {
28  CRC64 crc64;
29  typename std::map<unsigned long long, std::vector<WordPos> >::iterator it;
30  unsigned long long key = 0llu;
31  std::vector<HCE::CWords> cwords = outData->getCWords();
32  for(unsigned int i = 0; i < cwords.size(); i++)
33  {
34  if(cwords[i].getWordType() != HCE::WORD_TYPE::WORD && cwords[i].getWordType() != HCE::WORD_TYPE::NUMBER)
35  {
36  continue;
37  }
38  key = crc64.calc(cwords[i].getNormWord().c_str(), cwords[i].getNormWord().length());
39  if((it = wordMapPtr.find(key)) != wordMapPtr.end())
40  {
41  for(unsigned int y = 0; y < (*it).second.size(); y++)
42  {
43  if(searchType == SearchType::ST_SINGLE)
44  {
45  incrementalIndex++;
46  (*it).second[y].segmentId = incrementalIndex;
47  wordOffsetMap.insert(std::pair<unsigned int, WordPos>((*it).second[y].beginOffset, (*it).second[y]));
48  }
49  else if(searchType == SearchType::ST_STRICKT || searchType == SearchType::ST_NO_STRICKT)
50  {
51  wordPositionMap.insert(std::pair<unsigned int, std::pair<unsigned long long, WordPos> >
52  ((*it).second[y].position, std::pair<unsigned long long, WordPos>((*it).first, (*it).second[y])));
53  }
54  }
55  }
56  }
57  }
58 }
59 
60 void HighlightingAlgorithmRefine::findChain(Poco::SharedPtr<HCE::OutDataRefine> outData, std::map<unsigned int, WordPos> &wordOffsetMap)
61 {
62  if(!outData.isNull())
63  {
64  CRC64 crc64;
65  NotStricktChain notStricktChain;
66  typename std::map<unsigned int, std::pair<unsigned long long, WordPos> >::iterator tempIt;
67  std::vector<HCE::CWords> cwords = outData->getCWords();
68  for(unsigned int i = 0; i < cwords.size(); i++)
69  {
70  if(cwords[i].getWordType() != HCE::WORD_TYPE::WORD && cwords[i].getWordType() != HCE::WORD_TYPE::NUMBER)
71  {
72  continue;
73  }
74  notStricktChain.addWord(crc64.calc(cwords[i].getNormWord().c_str(), cwords[i].getNormWord().length()));
75  }
76  if(notStricktChain.wCount() > 0)
77  {
78  auto prevIt = wordPositionMap.begin();
79  auto startChainIt = wordPositionMap.begin();
80 // cppcheck-suppress variableScope
81  bool jump = false;
82  for(auto it = wordPositionMap.begin(); it != wordPositionMap.end(); ++it)
83  {
84  if((*it).first != ((*prevIt).first + 1) || jump)
85  {
86  notStricktChain.setDefaultCounts();
87  startChainIt = it;
88  jump = false;
89  }
90  notStricktChain.decrCount((*it).second.first);
91  if(notStricktChain.checkOnZerro())
92  {
93  incrementalIndex++;
94  for(tempIt = startChainIt; tempIt != it; ++tempIt)
95  {
96  (*tempIt).second.second.segmentId = incrementalIndex;
97  wordOffsetMap.insert(std::pair<unsigned int, WordPos>((*tempIt).second.second.beginOffset, (*tempIt).second.second));
98  }
99  (*it).second.second.segmentId = incrementalIndex;
100  wordOffsetMap.insert(std::pair<unsigned int, WordPos>((*it).second.second.beginOffset, (*it).second.second));
101  jump = true;
102  it = startChainIt;
103  }
104  prevIt = it;
105  }
106  }
107  }
108 }
109 
110 void HighlightingAlgorithmRefine::findChainStrict(Poco::SharedPtr<HCE::OutDataRefine> outData, std::map<unsigned int, WordPos> &wordOffsetMap)
111 {
112  if(!outData.isNull())
113  {
114  CRC64 crc64;
115  typename std::map<unsigned int, std::pair<unsigned long long, WordPos> >::iterator localIt;
116  std::vector<HCE::CWords> cwords = outData->getCWords();
117  std::vector<unsigned long long> crcVec;
118  for(unsigned int i = 0; i < cwords.size(); i++)
119  {
120  if(cwords[i].getWordType() != HCE::WORD_TYPE::WORD && cwords[i].getWordType() != HCE::WORD_TYPE::NUMBER)
121  {
122  continue;
123  }
124  crcVec.push_back(crc64.calc(cwords[i].getNormWord().c_str(), cwords[i].getNormWord().length()));
125  }
126  if(crcVec.size() > 0)
127  {
128 // cppcheck-suppress variableScope
129  unsigned int prevPos = 0;
130 // cppcheck-suppress variableScope
131  bool isChain = false;
132  for(auto it = wordPositionMap.begin(); it != wordPositionMap.end(); ++it)
133  {
134  if((*it).second.first == crcVec[0])
135  {
136  isChain = true;
137  prevPos = (*it).first;
138  localIt = it;
139  localIt++;
140  for(unsigned int i = 1; i < crcVec.size(); i++)
141  {
142  if(localIt == wordPositionMap.end() || ((*localIt).first != prevPos + 1) || ((*localIt).second.first != crcVec[i]))
143  {
144  isChain = false;
145  break;
146  }
147  prevPos = (*localIt).first;
148  localIt++;
149  }
150  if(isChain)
151  {
152  localIt = it;
153  for(unsigned int i = 0; i < crcVec.size(); i++)
154  {
155  incrementalIndex++;
156  (*localIt).second.second.segmentId = incrementalIndex;
157  wordOffsetMap.insert(std::pair<unsigned int, WordPos>((*localIt).second.second.beginOffset, (*localIt).second.second));
158  localIt++;
159  }
160  }
161  }
162  }
163  }
164  }
165 }
166 
167 void HighlightingAlgorithmRefine::joinBySegmentId(std::map<unsigned int, WordPos> &wordOffsetMap)
168 {
169  unsigned int localSegmentId = 0xFFFFFFFF;
170  std::map<unsigned int, WordPos> localWordOffsetMap;
171  std::pair<std::map<unsigned int, WordPos>::iterator, bool> insertIt;
172  for(auto it = wordOffsetMap.begin(); it != wordOffsetMap.end(); ++it)
173  {
174  if(it->second.segmentId == localSegmentId)
175  {
176  insertIt.first->second.endOffset = it->second.endOffset;
177  }
178  else
179  {
180  insertIt = localWordOffsetMap.insert(std::pair<unsigned int, WordPos>(it->first, it->second));
181  }
182  localSegmentId = it->second.segmentId;
183  }
184  wordOffsetMap = localWordOffsetMap;
185 }
186 
187 void HighlightingAlgorithmRefine::process(const std::vector<std::pair<SearchType, std::string> > &searchStrings, const Poco::SharedPtr<std::string> contentPtr,
188  ContentsStorageBase::WordPosPtrType wordMapPtr, std::map<unsigned int, WordPos> &wordOffsetMap)
189 {
190  if(wordMapPtr.isNull() || wordMapPtr->empty())
191  {
192  return;
193  }
194  wordPositionMap.clear();
195  for(unsigned int i = 0; i < searchStrings.size(); i++)
196  {
197  if(!searchStrings[i].second.empty() && searchStrings[i].second.length() > 0)
198  {
199  std::map<unsigned long long, std::vector<WordPos> > &localWordPosMap = (searchStrings[i].first == SearchType::ST_SINGLE ? (*wordMapPtr)[1] : (*wordMapPtr)[0]);
200  unsigned int normalizationId = (searchStrings[i].first == SearchType::ST_SINGLE ? DEFAULT_NORMALIZATION_WITH : DEFAULT_NORMALIZATION_WITHOUT);
201  Poco::SharedPtr<HCE::DataBase> inData = nullptr;
202  Poco::SharedPtr<HCE::DataBase> outDataFindStr = nullptr;
203  inData = Poco::SharedPtr<HCE::DataBase>(new HCE::InDataRefine(HCE::CT_REFINE));
204  inData.cast<HCE::InDataRefine>()->setNormalizationId(normalizationId);
205  inData.cast<HCE::InDataRefine>()->setSubjectMask(subjectMask);
206  inData.cast<HCE::InDataRefine>()->setContent(searchStrings[i].second);
207  outDataFindStr = refine.process(inData);
208  fillWordOffsetMap(outDataFindStr.cast<HCE::OutDataRefine>(), searchStrings[i].first, localWordPosMap, wordOffsetMap);
209  if(searchStrings[i].first == SearchType::ST_STRICKT)
210  {
211  findChainStrict(outDataFindStr.cast<HCE::OutDataRefine>(), wordOffsetMap);
212  }
213  else if(searchStrings[i].first == SearchType::ST_NO_STRICKT)
214  {
215  findChain(outDataFindStr.cast<HCE::OutDataRefine>(), wordOffsetMap);
216  }
217  }
218  }
219  joinBySegmentId(wordOffsetMap);
220 }