Support word sound normalization in regex searches

* For example, a single (crypto|nft) regex will now also match "cripto" or "cr1pto".
This commit is contained in:
Bofh 2022-12-16 03:36:14 +01:00
parent 05b8d6e6da
commit d8170d81d8
3 changed files with 70 additions and 22 deletions

View File

@ -192,7 +192,7 @@ if (isset($_GET['profile']) && trim($_GET['profile']) != '')
}
}
} else if ($qt === 'expr')
$matches = matches_comparing_expression($q, normalize_for_search($a_note));
$matches = matches_comparing_expression($q, $a_note);
if ($matches) {
$filtered_accounts []= $account['id'];

View File

@ -378,9 +378,29 @@ if (!function_exists('str_starts_with')) {
}
}
/**
 * Returns a copy of $str in which the single byte at offset $i is replaced by $c.
 *
 * The replacement works on bytes (substr), and $c is inserted as-is, so a
 * replacement longer than one byte makes the result longer than the input.
 *
 * @param string $str Source string.
 * @param int    $i   Byte offset of the character to replace.
 * @param string $c   Replacement text.
 * @return string The rebuilt string.
 */
function set_chat_at($str, $i, $c) {
    $head = substr($str, 0, $i);   // everything before offset $i
    $tail = substr($str, $i + 1);  // everything after offset $i
    return $head.$c.$tail;
}
/**
 * Applies the "sounds alike" substitution rules to a word, in a fixed order.
 *
 * Each rule is written as "from:to". Rules run sequentially on the result of
 * the previous one, so order matters: e.g. 'b:v' runs before '8:b', which
 * means a "b" produced from an "8" is not turned into a "v" afterwards.
 *
 * @param string        $word  Text to normalize.
 * @param callable|null $cback Optional replacer invoked once per rule as
 *                             $cback($word, [$from, $to]); it must return the
 *                             updated word. When null, a plain str_replace of
 *                             $from by $to is used.
 * @return string The word after every rule has been applied.
 */
function normalize_word_sound($word, $cback=null) {
    $rules = [
        '4:a', '3:e', '1:i', '0:o',
        '5:s', 'b:v', '8:b', 'k:c',
        'y:i', 'que:kee', 'q:k',
    ];
    foreach ($rules as $rule) {
        // $pair[0] is the text to look for, $pair[1] its replacement.
        $pair = explode(':', $rule);
        $word = ($cback === null)
            ? str_replace($pair[0], $pair[1], $word)
            : $cback($word, $pair);
    }
    return $word;
}
function normalize_for_search($str) {
if (trim($str) === '') return '';
$str = strtolower(remove_accents(trim($str)));
$str = strip_tags(trim($str));
$str = strtolower(remove_accents($str));
$str = preg_replace('/[^a-z0-9]+/', ' ', $str);
$str = preg_replace('/\s+/', ' ', $str);
$words = explode(' ', trim($str));
@ -388,15 +408,7 @@ function normalize_for_search($str) {
foreach ($words as $word) {
if (trim($word) === '')
continue;
$word = str_replace('4', 'a', $word);
$word = str_replace('3', 'e', $word);
$word = str_replace('1', 'i', $word);
$word = str_replace('0', 'o', $word);
$word = str_replace('5', 's', $word);
$word = str_replace('b', 'v', $word);
$word = str_replace('8', 'b', $word);
$word = str_replace('k', 'c', $word);
$word = str_replace('que', 'kee', $word);
$word = normalize_word_sound($word);
$nword = '';
for ($i = 0; $i < strlen($word); $i++) {
if ($i === 0) {
@ -415,9 +427,25 @@ function normalize_for_search($str) {
function parse_comparing_expression($expr) {
$expr = preg_replace('/\n/', ' ', $expr);
$expr = preg_replace('/\s+/', ' ', $expr);
$p_expr = preg_replace('/\(|\)/', ' ', $expr);
$p_expr = $expr;
$quot = false;
for ($i = 0; $i < strlen($p_expr); $i++) {
if ($p_expr[$i] === '"') {
$quot = !$quot;
continue;
}
if ($quot) {
if ($p_expr[$i] === '(')
$p_expr = set_chat_at($p_expr, $i, 'º');
else if ($p_expr[$i] === ')')
$p_expr = set_chat_at($p_expr, $i, 'ª');
}
}
$p_expr = preg_replace('/\(|\)/', ' ', $p_expr);
$p_expr = preg_replace('/\s+/', ' ', $p_expr);
$p_expr = preg_split('/OR|AND(\sNOT)?/', $p_expr);
$p_expr = str_replace('º', '(', $p_expr);
$p_expr = str_replace('ª', ')', $p_expr);
foreach ($p_expr as &$e) $e = trim($e);
return [
'original' => $expr,
@ -429,6 +457,7 @@ function matches_comparing_expression($expr, $text) {
if (gettype($expr) === 'string')
$expr = parse_comparing_expression($expr);
$result = $expr['original'];
$text = normalize_for_search($text);
$text_words = explode(' ', $text);
foreach ($expr['parsed'] as $t)
{
@ -478,6 +507,23 @@ function matches_comparing_expression($expr, $text) {
case 'matches':
if (strlen($content) <= 0)
continue 2;
$content = normalize_word_sound($content, function($w, $h) {
if ($h[0] === 'b')
return preg_replace('/([^\\\])b/', '\1'.$h[1], $w);
if (preg_match('/^\d$/', $h[0])) {
$b = false;
for ($i = 0; $i < strlen($w); $i++) {
if (in_array($w[$i], ['{','}','[',']'])) {
$b = !$b;
continue;
}
if (!$b && $w[$i] === $h[0])
$w = set_chat_at($w, $i, $h[1]);
}
return $w;
}
return str_replace($h[0], $h[1], $w);
});
if ($content[0] != '^')
$content = '^.*'.$content;
if ($content[strlen($content)-1] != '$')

View File

@ -139,16 +139,18 @@ function insert_string(main_string, ins_string, pos) {
return main_string.slice(0, pos) + ins_string + main_string.slice(pos);
}
function normalize_word_sound(word) {
word = word.replaceAll('4','a');
word = word.replaceAll('3','e');
word = word.replaceAll('1','i');
word = word.replaceAll('0','o');
word = word.replaceAll('5','s');
word = word.replaceAll('b','v');
word = word.replaceAll('8','b');
word = word.replaceAll('k','c');
word = word.replaceAll('que','kee');
/**
 * Applies the "sounds alike" substitution rules to a word, in a fixed order.
 *
 * Each rule is written as "from:to" and runs on the output of the previous
 * rule, so the order of the list is significant (e.g. 'b:v' runs before
 * '8:b', so a "b" produced from an "8" is left as "b").
 *
 * @param {string} word - Text to normalize.
 * @param {function(string, string[]): string} [cback] - Optional replacer
 *   called once per rule as cback(word, [from, to]); it must return the
 *   updated word. When omitted, String.prototype.replaceAll is used.
 * @returns {string} The word after every rule has been applied.
 */
function normalize_word_sound(word, cback) {
    const rules = [
        '4:a', '3:e', '1:i', '0:o',
        '5:s', 'b:v', '8:b', 'k:c',
        'y:i', 'que:kee', 'q:k',
    ];
    for (const rule of rules) {
        // pair[0] is the text to look for, pair[1] its replacement.
        const pair = rule.split(':');
        word = cback === undefined
            ? word.replaceAll(pair[0], pair[1])
            : cback(word, pair);
    }
    return word;
}