337 lines
9.9 KiB
PHP
337 lines
9.9 KiB
PHP
|
<?php
|
||
|
|
||
|
/**
|
||
|
* Simple Machines Forum (SMF)
|
||
|
*
|
||
|
* @package SMF
|
||
|
* @author Simple Machines https://www.simplemachines.org
|
||
|
* @copyright 2023 Simple Machines and individual contributors
|
||
|
* @license https://www.simplemachines.org/about/smf/license.php BSD
|
||
|
*
|
||
|
* @version 2.1.4
|
||
|
*/
|
||
|
|
||
|
if (!defined('SMF'))
|
||
|
die('No direct access...');
|
||
|
|
||
|
/**
|
||
|
* Used for the "custom search index" option
|
||
|
* Class custom_search
|
||
|
*/
|
||
|
class custom_search extends search_api
{
	/**
	 * @var array Index settings, decoded from $modSettings['search_custom_index_config']
	 */
	protected $indexSettings = array();

	/**
	 * @var array An array of banned words, taken from $modSettings['search_stopwords']
	 */
	protected $bannedWords = array();

	/**
	 * @var int|null Minimum word length (null for no minimum)
	 */
	protected $min_word_length = null;

	/**
	 * @var array Which databases support this method
	 */
	protected $supported_databases = array('mysql', 'postgresql');

	/**
	 * Constructor function
	 *
	 * Flags the API as unsupported when the current database type is not in
	 * $supported_databases. Otherwise, when a custom index configuration is
	 * stored, loads the index settings, the stop words and the minimum word
	 * length from $modSettings.
	 */
	public function __construct()
	{
		global $smcFunc, $modSettings, $db_type;

		// Is this database supported?
		if (!in_array($db_type, $this->supported_databases))
		{
			$this->is_supported = false;
			return;
		}

		// Nothing to load until a custom index has been configured.
		if (empty($modSettings['search_custom_index_config']))
			return;

		// An undecodable config must not leave us with a non-array here.
		$decoded = $smcFunc['json_decode']($modSettings['search_custom_index_config'], true);
		$this->indexSettings = is_array($decoded) ? $decoded : array();

		$this->bannedWords = empty($modSettings['search_stopwords']) ? array() : explode(',', $modSettings['search_stopwords']);
		$this->min_word_length = isset($this->indexSettings['bytes_per_word']) ? $this->indexSettings['bytes_per_word'] : null;
	}

	/**
	 * {@inheritDoc}
	 *
	 * Reports the methods this class implements itself and defers to the
	 * parent class for every other method name.
	 *
	 * @param string $methodName The name of the method being queried
	 * @param array|null $query_params Passed through unchanged to the parent
	 * @return bool Whether the method is supported
	 */
	public function supportsMethod($methodName, $query_params = null)
	{
		$own_methods = array(
			'isValid',
			'searchSort',
			'prepareIndexes',
			'indexedWordQuery',
			'postCreated',
			'postModified',
		);

		if (in_array($methodName, $own_methods, true))
			return true;

		// Maybe parent got support
		return parent::supportsMethod($methodName, $query_params);
	}

	/**
	 * {@inheritDoc}
	 *
	 * @return bool True when a custom index configuration is stored
	 */
	public function isValid()
	{
		global $modSettings;

		return !empty($modSettings['search_custom_index_config']);
	}

	/**
	 * {@inheritDoc}
	 *
	 * Comparison callback: each word is weighted by its byte length, minus
	 * 1000 when it appears in the global $excludedWords list, and the two
	 * weights are compared in ascending order.
	 *
	 * @param string $a The first word
	 * @param string $b The second word
	 * @return int -1, 0 or 1
	 */
	public function searchSort($a, $b)
	{
		global $excludedWords;

		// in_array() needs an array even when the global was never populated.
		$excluded = is_array($excludedWords) ? $excludedWords : array();

		$x = strlen($a) - (in_array($a, $excluded) ? 1000 : 0);
		$y = strlen($b) - (in_array($b, $excluded) ? 1000 : 0);

		return $x <=> $y;
	}

	/**
	 * {@inheritDoc}
	 *
	 * Adds the raw word to $wordsSearch['words'] (unless the index is forced)
	 * and the index ids produced by text2words() to
	 * $wordsSearch['indexed_words'], recording excluded ones in $wordsExclude.
	 *
	 * @param string $word The word or phrase being searched for
	 * @param array &$wordsSearch Receives the 'words' and 'indexed_words' entries
	 * @param array &$wordsExclude Receives the indexed words that are excluded
	 * @param bool $isExcluded Whether $word is an exclusion term
	 */
	public function prepareIndexes($word, array &$wordsSearch, array &$wordsExclude, $isExcluded)
	{
		global $modSettings, $smcFunc;

		$subwords = text2words($word, $this->min_word_length, true);

		if (empty($modSettings['search_force_index']))
			$wordsSearch['words'][] = $word;

		// Excluded phrases don't benefit from being split into subwords.
		if (count($subwords) > 1 && $isExcluded)
			return;

		foreach ($subwords as $subword)
		{
			// NOTE(review): text2words() is not visible here; confirm its output is comparable with the banned word list.
			if ($smcFunc['strlen']($subword) < $this->min_word_length || in_array($subword, $this->bannedWords))
				continue;

			$wordsSearch['indexed_words'][] = $subword;

			if ($isExcluded)
				$wordsExclude[] = $subword;
		}
	}

	/**
	 * {@inheritDoc}
	 *
	 * Builds and runs the query that selects matching message ids by joining
	 * {db_prefix}messages against {db_prefix}log_search_words once per
	 * indexed word, with extra body/subject conditions for the raw words.
	 *
	 * @param array $words Uses the 'words' (raw words matched against the body)
	 *  and 'indexed_words' (index ids) entries
	 * @param array $search_data Uses the 'no_regexp', 'params', 'insert_into',
	 *  'max_results' and 'indexed_results' entries
	 * @return mixed Whatever $smcFunc['db_search_query'] returns
	 */
	public function indexedWordQuery(array $words, array $search_data)
	{
		global $modSettings, $smcFunc;

		// Specify the function to search with. Regex is for word boundaries.
		$is_search_regex = !empty($modSettings['search_match_words']) && !$search_data['no_regexp'];
		$query_match_type = $is_search_regex ? 'RLIKE' : 'LIKE';
		$word_boundary_wrapper = function(string $str) use ($smcFunc): string
		{
			return sprintf($smcFunc['db_supports_pcre'] ? '\\b%s\\b' : '[[:<:]]%s[[:>:]]', $str);
		};
		$escape_sql_regex = function(string $str): string
		{
			return addcslashes(preg_replace('/[\[\]$.+*?&^|{}()]/', '[$0]', $str), '\\\'');
		};

		// Turns a search term into the value bound to a LIKE / RLIKE placeholder.
		$build_pattern = function($term) use ($smcFunc, $is_search_regex, $word_boundary_wrapper, $escape_sql_regex)
		{
			if ($is_search_regex)
				return $word_boundary_wrapper($escape_sql_regex($term));

			return '%' . $smcFunc['db_escape_wildcard_string']($term) . '%';
		};

		$query_select = array(
			'id_msg' => 'm.id_msg',
		);
		$query_inner_join = array();
		$query_left_join = array();
		$query_where = array();
		$query_params = $search_data['params'];

		if (!empty($query_params['id_search']))
			$query_select['id_search'] = '{int:id_search}';

		// Raw words are matched against the message body.
		$count = 0;
		foreach ($words['words'] as $regularWord)
		{
			$negate = in_array($regularWord, $query_params['excluded_words']) ? 'NOT ' : '';

			$query_where[] = 'm.body ' . $negate . $query_match_type . ' {string:complex_body_' . $count . '}';
			$query_params['complex_body_' . $count] = $build_pattern($regularWord);
			$count++;
		}

		if (!empty($query_params['user_query']))
			$query_where[] = '{raw:user_query}';
		if (!empty($query_params['board_query']))
			$query_where[] = 'm.id_board {raw:board_query}';

		if (!empty($query_params['topic']))
			$query_where[] = 'm.id_topic = {int:topic}';
		if (!empty($query_params['min_msg_id']))
			$query_where[] = 'm.id_msg >= {int:min_msg_id}';
		if (!empty($query_params['max_msg_id']))
			$query_where[] = 'm.id_msg <= {int:max_msg_id}';

		// Excluded phrases get their own placeholders so the excluded subject
		// words below cannot overwrite them.
		$count = 0;
		if (!empty($query_params['excluded_phrases']) && empty($modSettings['search_force_index']))
			foreach ($query_params['excluded_phrases'] as $phrase)
			{
				$query_where[] = 'subject NOT ' . $query_match_type . ' {string:exclude_subject_phrase_' . $count . '}';
				$query_params['exclude_subject_phrase_' . $count] = $build_pattern($phrase);
				$count++;
			}

		$count = 0;
		if (!empty($query_params['excluded_subject_words']) && empty($modSettings['search_force_index']))
			foreach ($query_params['excluded_subject_words'] as $excludedWord)
			{
				$query_where[] = 'subject NOT ' . $query_match_type . ' {string:exclude_subject_words_' . $count . '}';
				$query_params['exclude_subject_words_' . $count] = $build_pattern($excludedWord);
				$count++;
			}

		// One log_search_words alias per indexed word: excluded words are
		// LEFT JOINed and required to be absent, the others are chained with
		// INNER JOINs on the previously joined alias.
		// NOTE(review): $indexedWord is concatenated into the SQL; presumably always an integer id from text2words() - confirm.
		$numTables = 0;
		$prev_join = 0;
		foreach ($words['indexed_words'] as $indexedWord)
		{
			$numTables++;
			$alias = 'lsw' . $numTables;

			if (in_array($indexedWord, $query_params['excluded_index_words']))
			{
				$query_left_join[] = '{db_prefix}log_search_words AS ' . $alias . ' ON (' . $alias . '.id_word = ' . $indexedWord . ' AND ' . $alias . '.id_msg = m.id_msg)';
				$query_where[] = '(' . $alias . '.id_word IS NULL)';
			}
			else
			{
				$query_inner_join[] = '{db_prefix}log_search_words AS ' . $alias . ' ON (' . $alias . '.id_msg = ' . ($prev_join === 0 ? 'm' : 'lsw' . $prev_join) . '.id_msg)';
				$query_where[] = $alias . '.id_word = ' . $indexedWord;
				$prev_join = $numTables;
			}
		}

		$ignoreRequest = $smcFunc['db_search_query']('insert_into_log_messages_fulltext', ($smcFunc['db_support_ignore'] ? ('
			INSERT IGNORE INTO {db_prefix}' . $search_data['insert_into'] . '
				(' . implode(', ', array_keys($query_select)) . ')') : '') . '
			SELECT ' . implode(', ', $query_select) . '
			FROM {db_prefix}messages AS m' . (empty($query_inner_join) ? '' : '
				INNER JOIN ' . implode('
				INNER JOIN ', $query_inner_join)) . (empty($query_left_join) ? '' : '
				LEFT JOIN ' . implode('
				LEFT JOIN ', $query_left_join)) . '
			WHERE ' . implode('
				AND ', $query_where) . (empty($search_data['max_results']) ? '' : '
			LIMIT ' . ($search_data['max_results'] - $search_data['indexed_results'])),
			$query_params
		);

		return $ignoreRequest;
	}

	/**
	 * {@inheritDoc}
	 *
	 * Indexes the body of a newly created message by inserting one
	 * (id_word, id_msg) row per word into {db_prefix}log_search_words.
	 *
	 * @param array &$msgOptions Uses the 'body' and 'id' entries
	 * @param array &$topicOptions Not used here
	 * @param array &$posterOptions Not used here
	 */
	public function postCreated(array &$msgOptions, array &$topicOptions, array &$posterOptions)
	{
		global $modSettings, $smcFunc;

		$customIndexSettings = $smcFunc['json_decode']($modSettings['search_custom_index_config'], true);

		$inserts = array();
		foreach (text2words($msgOptions['body'], $customIndexSettings['bytes_per_word'], true) as $word)
			$inserts[] = array($word, $msgOptions['id']);

		if (empty($inserts))
			return;

		$smcFunc['db_insert']('ignore',
			'{db_prefix}log_search_words',
			array('id_word' => 'int', 'id_msg' => 'int'),
			$inserts,
			array('id_word', 'id_msg')
		);
	}

	/**
	 * {@inheritDoc}
	 *
	 * Re-indexes a modified message: words that disappeared from the body are
	 * removed from {db_prefix}log_search_words and words that appeared are
	 * added. Does nothing when $msgOptions['body'] is not set.
	 *
	 * @param array &$msgOptions Uses the 'body', 'id' and optional 'old_body' entries
	 * @param array &$topicOptions Not used here
	 * @param array &$posterOptions Not used here
	 */
	public function postModified(array &$msgOptions, array &$topicOptions, array &$posterOptions)
	{
		global $modSettings, $smcFunc;

		if (!isset($msgOptions['body']))
			return;

		$customIndexSettings = $smcFunc['json_decode']($modSettings['search_custom_index_config'], true);
		$stopwords = empty($modSettings['search_stopwords']) ? array() : explode(',', $modSettings['search_stopwords']);
		$old_body = isset($msgOptions['old_body']) ? $msgOptions['old_body'] : '';

		// Create the new and old index.
		$old_index = text2words($old_body, $customIndexSettings['bytes_per_word'], true);
		$new_index = text2words($msgOptions['body'], $customIndexSettings['bytes_per_word'], true);

		// Calculate the words to be added and removed from the index.
		$removed_words = array_diff(array_diff($old_index, $new_index), $stopwords);
		$inserted_words = array_diff(array_diff($new_index, $old_index), $stopwords);

		// Delete the removed words AND the added ones to avoid key constraints.
		// This has to happen even when nothing was removed: without the old
		// body every word counts as "inserted" and may already be indexed.
		$words_to_delete = array_merge($removed_words, $inserted_words);
		if (!empty($words_to_delete))
		{
			$smcFunc['db_query']('', '
				DELETE FROM {db_prefix}log_search_words
				WHERE id_msg = {int:id_msg}
					AND id_word IN ({array_int:removed_words})',
				array(
					'removed_words' => $words_to_delete,
					'id_msg' => $msgOptions['id'],
				)
			);
		}

		// Add the new words to be indexed.
		if (!empty($inserted_words))
		{
			$inserts = array();
			foreach ($inserted_words as $word)
				$inserts[] = array($word, $msgOptions['id']);

			// id_word is an integer column, as in postCreated() and the DELETE above.
			$smcFunc['db_insert']('insert',
				'{db_prefix}log_search_words',
				array('id_word' => 'int', 'id_msg' => 'int'),
				$inserts,
				array('id_word', 'id_msg')
			);
		}
	}
}
|
||
|
|
||
|
?>
|