From d8170d81d86611f44c190e05fd3a7b5121e2336a Mon Sep 17 00:00:00 2001 From: Bofh Date: Fri, 16 Dec 2022 03:36:14 +0100 Subject: [PATCH] Support word sound normalization in regex searches * For example, a single (crypto|nft) regex will make "cripto" or "cr1pto" also match. --- .../database/mastodon/accounts/search/mod.php | 2 +- base.php | 68 ++++++++++++++++--- js/base.php | 22 +++--- 3 files changed, 70 insertions(+), 22 deletions(-) diff --git a/api/v1/database/mastodon/accounts/search/mod.php b/api/v1/database/mastodon/accounts/search/mod.php index e4dd832..ee4be35 100644 --- a/api/v1/database/mastodon/accounts/search/mod.php +++ b/api/v1/database/mastodon/accounts/search/mod.php @@ -192,7 +192,7 @@ if (isset($_GET['profile']) && trim($_GET['profile']) != '') } } } else if ($qt === 'expr') - $matches = matches_comparing_expression($q, normalize_for_search($a_note)); + $matches = matches_comparing_expression($q, $a_note); if ($matches) { $filtered_accounts []= $account['id']; diff --git a/base.php b/base.php index cc0c392..9723a8b 100644 --- a/base.php +++ b/base.php @@ -378,9 +378,29 @@ if (!function_exists('str_starts_with')) { } } +function set_chat_at($str, $i, $c) { + return substr($str, 0, $i).$c.substr($str, $i+1); +} + +function normalize_word_sound($word, $cback=null) { + $hashes = [ + '4:a', '3:e', '1:i', '0:o', + '5:s', 'b:v', '8:b', 'k:c', + 'y:i', 'que:kee', 'q:k', + ]; + foreach ($hashes as $hash) { + $hash = explode(':', $hash); + if ($cback !== null) + $word = $cback($word, $hash); + else $word = str_replace($hash[0], $hash[1], $word); + } + return $word; +} + function normalize_for_search($str) { if (trim($str) === '') return ''; - $str = strtolower(remove_accents(trim($str))); + $str = strip_tags(trim($str)); + $str = strtolower(remove_accents($str)); $str = preg_replace('/[^a-z0-9]+/', ' ', $str); $str = preg_replace('/\s+/', ' ', $str); $words = explode(' ', trim($str)); @@ -388,15 +408,7 @@ function normalize_for_search($str) { foreach ($words 
as $word) { if (trim($word) === '') continue; - $word = str_replace('4', 'a', $word); - $word = str_replace('3', 'e', $word); - $word = str_replace('1', 'i', $word); - $word = str_replace('0', 'o', $word); - $word = str_replace('5', 's', $word); - $word = str_replace('b', 'v', $word); - $word = str_replace('8', 'b', $word); - $word = str_replace('k', 'c', $word); - $word = str_replace('que', 'kee', $word); + $word = normalize_word_sound($word); $nword = ''; for ($i = 0; $i < strlen($word); $i++) { if ($i === 0) { @@ -415,9 +427,25 @@ function normalize_for_search($str) { function parse_comparing_expression($expr) { $expr = preg_replace('/\n/', ' ', $expr); $expr = preg_replace('/\s+/', ' ', $expr); - $p_expr = preg_replace('/\(|\)/', ' ', $expr); + $p_expr = $expr; + $quot = false; + for ($i = 0; $i < strlen($p_expr); $i++) { + if ($p_expr[$i] === '"') { + $quot = !$quot; + continue; + } + if ($quot) { + if ($p_expr[$i] === '(') + $p_expr = set_chat_at($p_expr, $i, 'º'); + else if ($p_expr[$i] === ')') + $p_expr = set_chat_at($p_expr, $i, 'ª'); + } + } + $p_expr = preg_replace('/\(|\)/', ' ', $p_expr); $p_expr = preg_replace('/\s+/', ' ', $p_expr); $p_expr = preg_split('/OR|AND(\sNOT)?/', $p_expr); + $p_expr = str_replace('º', '(', $p_expr); + $p_expr = str_replace('ª', ')', $p_expr); foreach ($p_expr as &$e) $e = trim($e); return [ 'original' => $expr, @@ -429,6 +457,7 @@ function matches_comparing_expression($expr, $text) { if (gettype($expr) === 'string') $expr = parse_comparing_expression($expr); $result = $expr['original']; + $text = normalize_for_search($text); $text_words = explode(' ', $text); foreach ($expr['parsed'] as $t) { @@ -478,6 +507,23 @@ function matches_comparing_expression($expr, $text) { case 'matches': if (strlen($content) <= 0) continue 2; + $content = normalize_word_sound($content, function($w, $h) { + if ($h[0] === 'b') + return preg_replace('/([^\\\])b/', '\1'.$h[1], $w); + if (preg_match('/^\d$/', $h[0])) { + $b = false; + for ($i = 0; $i 
< strlen($w); $i++) { + if (in_array($w[$i], ['{','}','[',']'])) { + $b = !$b; + continue; + } + if (!$b && $w[$i] === $h[0]) + $w = set_chat_at($w, $i, $h[1]); + } + return $w; + } + return str_replace($h[0], $h[1], $w); + }); if ($content[0] != '^') $content = '^.*'.$content; if ($content[strlen($content)-1] != '$') diff --git a/js/base.php b/js/base.php index f54f1c3..d4db603 100644 --- a/js/base.php +++ b/js/base.php @@ -139,16 +139,18 @@ function insert_string(main_string, ins_string, pos) { return main_string.slice(0, pos) + ins_string + main_string.slice(pos); } -function normalize_word_sound(word) { - word = word.replaceAll('4','a'); - word = word.replaceAll('3','e'); - word = word.replaceAll('1','i'); - word = word.replaceAll('0','o'); - word = word.replaceAll('5','s'); - word = word.replaceAll('b','v'); - word = word.replaceAll('8','b'); - word = word.replaceAll('k','c'); - word = word.replaceAll('que','kee'); +function normalize_word_sound(word, cback) { + const hashes = [ + '4:a', '3:e', '1:i', '0:o', + '5:s', 'b:v', '8:b', 'k:c', + 'y:i', 'que:kee', 'q:k', + ]; + for (var i = 0; i < hashes.length; i++) { + const hash = hashes[i].split(':'); + if (cback !== undefined) + word = cback(word, hash); + else word = word.replaceAll(hash[0], hash[1]); + } return word; }