2 namespace TYPO3\CMS\IndexedSearch;
51 'printjoins' => array(46, 45, 95, 58, 47, 39),
52 'casesensitive' =>
false,
54 'removeChars' => array(45)
63 $this->csObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance(\TYPO3\CMS\Core\Charset\CharsetConverter::class);
76 $this->debugString =
'';
78 if (!$this->lexerConf[
'casesensitive']) {
79 $wordString = $this->csObj->conv_case(
'utf-8', $wordString,
'toLower');
86 $this->debugString =
'';
88 list($start, $len) = $this->
get_word($wordString, $pos);
90 $this->
addWords($words, $wordString, $start, $len);
92 $this->debugString .=
'<span style="color:red">' . htmlspecialchars(substr($wordString, $pos, ($start - $pos))) .
'</span>' . htmlspecialchars(substr($wordString, $start, $len));
117 public function addWords(&$words, &$wordString, $start, $len)
120 $theWord = substr($wordString, $start, $len);
123 $cp = $this->
utf8_ord($theWord, $bc);
124 list($cType) = $this->
charType($cp);
137 if ($cType ==
'cjk') {
139 $strlen = $this->csObj->utf8_strlen($theWord);
141 for ($a = 0; $a < $strlen; $a++) {
142 if ($strlen == 1 || $a < $strlen - 1) {
143 $words[] = $this->csObj->utf8_substr($theWord, $a, 2);
149 foreach ($this->lexerConf[
'removeChars'] as $skipJoin) {
150 $theWord = str_replace($this->csObj->UnumberToChar($skipJoin),
'', $theWord);
169 return array($pos, $len);
173 if ($str[$pos] ==
'') {
178 return array($pos, $len);
195 $cType = ($cType_prev =
false);
199 if ($str[$pos] ==
'') {
208 if (!$cType || $cType_prev ==
'cjk' && \TYPO3\CMS\Core\Utility\
GeneralUtility::inList(
'num,alpha', $cType) || $cType ==
'cjk' && \TYPO3\CMS\Core\Utility\GeneralUtility::inList(
'num,alpha', $cType_prev)) {
210 if (!in_array($cp, $this->lexerConf[
'printjoins'])) {
213 $len = $printJoinLgd;
218 if (!$printJoinLgd) {
219 $printJoinLgd = $len;
226 }
elseif (!$letter && $cType) {
233 if ($str[$pos] ==
'') {
238 $cp = $this->
utf8_ord($str, $bc, $pos);
241 $cType_prev = $cType;
242 list($cType) = $this->
charType($cp);
263 if ($cp >= 48 && $cp <= 57) {
267 if ($cp >= 65 && $cp <= 90 || $cp >= 97 && $cp <= 122 || $cp >= 192 && $cp <= 255 && $cp != 215 && $cp != 247 || $cp >= 256 && $cp < 640 || ($cp == 902 || $cp >= 904 && $cp < 1024) || ($cp >= 1024 && $cp < 1154 || $cp >= 1162 && $cp < 1328) || ($cp >= 1424 && $cp < 1456 || $cp >= 1488 && $cp < 1523) || ($cp >= 1569 && $cp <= 1624 || $cp >= 1646 && $cp <= 1747) || $cp >= 7680 && $cp < 8192) {
268 return array(
'alpha');
273 if ($cp >= 12352 && $cp <= 12543 || $cp >= 12592 && $cp <= 12687 || $cp >= 13312 && $cp <= 19903 || $cp >= 19968 && $cp <= 40879 || $cp >= 44032 && $cp <= 55215 || $cp >= 131072 && $cp <= 195103) {
287 public function utf8_ord(&$str, &$len, $pos = 0, $hex =
false)
289 $ord = ord($str[$pos]);
292 for ($bc = -1, $mbs = $ord; $mbs & 128; $mbs = $mbs << 1) {
297 $ord = $ord & (1 << 6 - $bc) - 1;
300 for ($i = $pos + 1; $bc; $bc--, $i++) {
301 $ord = $ord << 6 | ord($str[$i]) & 63;
304 return $hex ?
'x' . dechex($ord) : $ord;