Support word sound normalization in regex searches

* For example, a single (crypto|nft) regex will now also match "cripto" or "cr1pto".
2022-12-16 03:36:14 +01:00 · 2022-12-16 03:36:14 +01:00 · d8170d81d8
parent 05b8d6e6da
commit d8170d81d8
3 changed files with 70 additions and 22 deletions
--- a/api/v1/database/mastodon/accounts/search/mod.php
+++ b/api/v1/database/mastodon/accounts/search/mod.php
@ -192,7 +192,7 @@ if (isset($_GET['profile']) && trim($_GET['profile']) != '')
 				}
 			}
 		} else if ($qt === 'expr')
-			$matches = matches_comparing_expression($q, normalize_for_search($a_note));
+			$matches = matches_comparing_expression($q, $a_note);

 		if ($matches) {
 			$filtered_accounts []= $account['id'];
--- a/base.php
+++ b/base.php
@ -378,9 +378,29 @@ if (!function_exists('str_starts_with')) {
 	}
 }

+function set_chat_at($str, $i, $c) {
+	return substr($str, 0, $i).$c.substr($str, $i+1);
+}
+
+function normalize_word_sound($word, $cback=null) {
+	$hashes = [
+		'4:a', '3:e', '1:i', '0:o',
+		'5:s', 'b:v', '8:b', 'k:c',
+		'y:i', 'que:kee', 'q:k',
+	];
+	foreach ($hashes as $hash) {
+		$hash = explode(':', $hash);
+		if ($cback !== null)
+			$word = $cback($word, $hash);
+		else $word = str_replace($hash[0], $hash[1], $word);
+	}
+	return $word;
+}
+
 function normalize_for_search($str) {
 	if (trim($str) === '') return '';
-	$str = strtolower(remove_accents(trim($str)));
+	$str = strip_tags(trim($str));
+	$str = strtolower(remove_accents($str));
 	$str = preg_replace('/[^a-z0-9]+/', ' ', $str);
 	$str = preg_replace('/\s+/', ' ', $str);
 	$words = explode(' ', trim($str));
@ -388,15 +408,7 @@ function normalize_for_search($str) {
 	foreach ($words as $word) {
 		if (trim($word) === '')
 			continue;
-		$word = str_replace('4', 'a', $word);
-		$word = str_replace('3', 'e', $word);
-		$word = str_replace('1', 'i', $word);
-		$word = str_replace('0', 'o', $word);
-		$word = str_replace('5', 's', $word);
-		$word = str_replace('b', 'v', $word);
-		$word = str_replace('8', 'b', $word);
-		$word = str_replace('k', 'c', $word);
-		$word = str_replace('que', 'kee', $word);
+		$word = normalize_word_sound($word);
 		$nword = '';
 		for ($i = 0; $i < strlen($word); $i++) {
 			if ($i === 0) {
@ -415,9 +427,25 @@ function normalize_for_search($str) {
 function parse_comparing_expression($expr) {
 	$expr = preg_replace('/\n/', ' ', $expr);
 	$expr = preg_replace('/\s+/', ' ', $expr);
-	$p_expr = preg_replace('/\(|\)/', ' ', $expr);
+	$p_expr = $expr;
+	$quot = false;
+	for ($i = 0; $i < strlen($p_expr); $i++) {
+		if ($p_expr[$i] === '"') {
+			$quot = !$quot;
+			continue;
+		}
+		if ($quot) {
+			if ($p_expr[$i] === '(')
+				$p_expr = set_chat_at($p_expr, $i, 'º');
+			else if ($p_expr[$i] === ')')
+				$p_expr = set_chat_at($p_expr, $i, 'ª');
+		}
+	}
+	$p_expr = preg_replace('/\(|\)/', ' ', $p_expr);
 	$p_expr = preg_replace('/\s+/', ' ', $p_expr);
 	$p_expr = preg_split('/OR|AND(\sNOT)?/', $p_expr);
+	$p_expr = str_replace('º', '(', $p_expr);
+	$p_expr = str_replace('ª', ')', $p_expr);
 	foreach ($p_expr as &$e) $e = trim($e);
 	return [
 		'original' => $expr,
@ -429,6 +457,7 @@ function matches_comparing_expression($expr, $text) {
 	if (gettype($expr) === 'string')
 		$expr = parse_comparing_expression($expr);
 	$result = $expr['original'];
+	$text = normalize_for_search($text);
 	$text_words = explode(' ', $text);
 	foreach ($expr['parsed'] as $t)
 	{
@ -478,6 +507,23 @@ function matches_comparing_expression($expr, $text) {
 			case 'matches':
 				if (strlen($content) <= 0)
 					continue 2;
+				$content = normalize_word_sound($content, function($w, $h) {
+					if ($h[0] === 'b')
+						return preg_replace('/([^\\\])b/', '\1'.$h[1], $w);
+					if (preg_match('/^\d$/', $h[0])) {
+						$b = false;
+						for ($i = 0; $i < strlen($w); $i++) {
+							if (in_array($w[$i], ['{','}','[',']'])) {
+								$b = !$b;
+								continue;
+							}
+							if (!$b && $w[$i] === $h[0])
+								$w = set_chat_at($w, $i, $h[1]);
+						}
+						return $w;
+					}
+					return str_replace($h[0], $h[1], $w);
+				});
 				if ($content[0] != '^')
 					$content = '^.*'.$content;
 				if ($content[strlen($content)-1] != '$')
--- a/js/base.php
+++ b/js/base.php
@ -139,16 +139,18 @@ function insert_string(main_string, ins_string, pos) {
 	return main_string.slice(0, pos) + ins_string + main_string.slice(pos);
 }

-function normalize_word_sound(word) {
-	word = word.replaceAll('4','a');
-	word = word.replaceAll('3','e');
-	word = word.replaceAll('1','i');
-	word = word.replaceAll('0','o');
-	word = word.replaceAll('5','s');
-	word = word.replaceAll('b','v');
-	word = word.replaceAll('8','b');
-	word = word.replaceAll('k','c');
-	word = word.replaceAll('que','kee');
+function normalize_word_sound(word, cback) {
+	const hashes = [
+		'4:a', '3:e', '1:i', '0:o',
+		'5:s', 'b:v', '8:b', 'k:c',
+		'y:i', 'que:kee', 'q:k',
+	];
+	for (var i = 0; i < hashes.length; i++) {
+		const hash = hashes[i].split(':');
+		if (cback !== undefined)
+			word = cback(word, hash);
+		else word = word.replaceAll(hash[0], hash[1]);
+	}
 	return word;
 }