mdf: add search

This commit is contained in:
E. S. 2024-03-13 17:19:27 +00:00
parent d5c352db13
commit c542307f00
5 changed files with 139 additions and 25 deletions

View File

@ -308,15 +308,20 @@ function pcre_check_error(mixed &$result, bool $no_error = false): bool {
return true; return true;
} }
/** function hl_matched(string $s, string|Stringable|SkinString|array|null $keywords = []): string {
* @param string $s
* @param string|string[]|null $keywords
* @return string
*/
function hl_matched(string $s, string|array|null $keywords = []): string {
if (is_null($keywords)) if (is_null($keywords))
return htmlescape($s); return htmlescape($s);
if ($keywords instanceof Stringable)
$keywords = $keywords->__toString();
else if (is_array($keywords)) {
$keywords = array_map(function($s) {
if ($s instanceof Stringable)
return $s->__toString();
return $s;
}, $keywords);
}
if (is_string($keywords)) if (is_string($keywords))
$keywords = preg_split('/\s+/', $keywords); $keywords = preg_split('/\s+/', $keywords);

View File

@ -115,8 +115,55 @@ class FilesHandler extends request_handler {
break; break;
case FilesCollection::MercureDeFrance: case FilesCollection::MercureDeFrance:
if ($query !== null) {
$files = mdf_search($query, $offset, self::SEARCH_RESULTS_PER_PAGE);
$vars += [
'search_count' => $files['count'],
'search_query' => $query
];
/** @var MDFCollectionItem[] $files */
$files = $files['items'];
$query_words = array_map('mb_strtolower', preg_split('/\s+/', $query));
$found = [];
$result_ids = [];
foreach ($files as $file) {
$result_ids[] = $file->id;
foreach ([
$file->date,
(string)$file->issue
] as $haystack) {
foreach ($query_words as $qw) {
if (mb_strpos($haystack, $qw) !== false) {
$found[$file->id] = true;
continue 2;
}
}
}
}
$found = array_map('intval', array_keys($found));
$not_found = array_diff($result_ids, $found);
if (!empty($not_found))
$text_excerpts = mdf_get_text_excerpts($not_found, $query_words);
if (is_xhr_request()) {
ajax_ok([
...$vars,
'new_offset' => $offset + count($files),
'html' => skin('files')->collection_files($files, $query, $text_excerpts)
]);
}
} else {
$files = mdf_get(); $files = mdf_get();
set_title('$files_mdf_collection'); }
$title = lang('files_mdf_collection');
if ($query)
$title .= ' - '.htmlescape($query);
set_title($title);
break; break;
} }

View File

@ -3,7 +3,7 @@
require_once 'engine/sphinx.php'; require_once 'engine/sphinx.php';
const WFF_ARCHIVE_SPHINX_RTINDEX = 'wff_collection'; const WFF_ARCHIVE_SPHINX_RTINDEX = 'wff_collection';
//const MDF_ARCHIVE_SPHINX/**/_RTINDEX = 'mdf_archive'; const MDF_ARCHIVE_SPHINX_RTINDEX = 'mdf_archive';
//const BACONIANA_ARCHIVE_SPHINX_RTINDEX = 'baconiana_archive'; //const BACONIANA_ARCHIVE_SPHINX_RTINDEX = 'baconiana_archive';
enum FilesCollection: string { enum FilesCollection: string {
@ -12,7 +12,7 @@ enum FilesCollection: string {
case Baconiana = 'baconiana'; case Baconiana = 'baconiana';
public function isSearchSupported(): bool { public function isSearchSupported(): bool {
return $this == FilesCollection::WilliamFriedman; return $this == FilesCollection::WilliamFriedman || $this == FilesCollection::MercureDeFrance;
} }
} }
@ -161,13 +161,13 @@ class MDFCollectionItem extends model implements FilesItemInterface {
return "{$this->issue}, {$this->getHumanFriendlyDate()}"; return "{$this->issue}, {$this->getHumanFriendlyDate()}";
} }
protected function getHumanFriendlyDate(): string { public function getHumanFriendlyDate(): string {
$dt = new DateTime($this->date); $dt = new DateTime($this->date);
return $dt->format('j M Y'); return $dt->format('j M Y');
} }
public function isTargetBlank(): bool { return true; } public function isTargetBlank(): bool { return true; }
public function getId(): string { return (string)$this->issue; } public function getId(): string { return $this->id; }
public function getUrl(): string { public function getUrl(): string {
global $config; global $config;
return 'https://'.$config['files_domain'].'/Mercure-de-France-OCR/'.$this->path; return 'https://'.$config['files_domain'].'/Mercure-de-France-OCR/'.$this->path;
@ -339,9 +339,7 @@ function wff_get_by_id(array $ids): array {
} }
function wff_search(string $q, int $offset = 0, int $count = 0): array { function wff_search(string $q, int $offset = 0, int $count = 0): array {
$query_filtered = sphinx_mkquery($q, [ $query_filtered = sphinx_mkquery($q);
'star' => false,
]);
$cl = sphinx_client(); $cl = sphinx_client();
$cl->setLimits($offset, $count); $cl->setLimits($offset, $count);
@ -391,17 +389,24 @@ function wff_reindex(): void {
} }
} }
/**
 * Text excerpts for Mercure de France items.
 *
 * Delegates to _get_text_excerpts(), pointing it at the `mdf_texts` table
 * with `mdf_id` as the id column.
 *
 * @param int[] $ids
 * @param string[] $keywords Must already be lower-cased
 * @param int $before
 * @param int $after
 * @return array
 */
function mdf_get_text_excerpts(array $ids, array $keywords, int $before = 50, int $after = 40): array {
    $excerpts = _get_text_excerpts(
        'mdf_texts',
        'mdf_id',
        $ids,
        $keywords,
        $before,
        $after
    );
    return $excerpts;
}
/**
 * Text excerpts for William Friedman collection items.
 *
 * Delegates to _get_text_excerpts(), pointing it at the `wff_texts` table
 * with `wff_id` as the id column.
 *
 * @param int[] $ids
 * @param string[] $keywords Must already be lower-cased
 * @param int $before
 * @param int $after
 * @return array
 */
function wff_get_text_excerpts(array $ids, array $keywords, int $before = 50, int $after = 40): array {
    $excerpts = _get_text_excerpts(
        'wff_texts',
        'wff_id',
        $ids,
        $keywords,
        $before,
        $after
    );
    return $excerpts;
}
/** /**
* @param string $table
* @param string $field_id
* @param int[] $ids * @param int[] $ids
* @param string[] $keywords Must already be lower-cased * @param string[] $keywords Must already be lower-cased
* @param int $before * @param int $before
* @param int $after * @param int $after
* @return array * @return array
*/ */
function wff_get_text_excerpts(array $ids, function _get_text_excerpts(string $table, string $field_id, array $ids, array $keywords, int $before, int $after) {
array $keywords,
int $before = 50,
int $after = 40): array {
$results = []; $results = [];
foreach ($ids as $id) foreach ($ids as $id)
$results[$id] = null; $results[$id] = null;
@ -411,7 +416,7 @@ function wff_get_text_excerpts(array $ids,
$dynamic_sql_parts = []; $dynamic_sql_parts = [];
$combined_parts = []; $combined_parts = [];
foreach ($keywords as $keyword) { foreach ($keywords as $keyword) {
$part = "LOCATE('".$db->escape($keyword)."', LOWER(text))"; $part = "LOCATE('".$db->escape($keyword)."', text)";
$dynamic_sql_parts[] = $part; $dynamic_sql_parts[] = $part;
} }
if (count($dynamic_sql_parts) > 1) { if (count($dynamic_sql_parts) > 1) {
@ -425,7 +430,7 @@ function wff_get_text_excerpts(array $ids,
$total = $before + $after; $total = $before + $after;
$sql = "SELECT $sql = "SELECT
wff_id AS id, {$field_id} AS id,
GREATEST( GREATEST(
1, 1,
{$combined_parts} - {$before} {$combined_parts} - {$before}
@ -442,9 +447,9 @@ function wff_get_text_excerpts(array $ids,
) )
) AS excerpt ) AS excerpt
FROM FROM
wff_texts {$table}
WHERE WHERE
wff_id IN (".implode(',', $ids).")"; {$field_id} IN (".implode(',', $ids).")";
$q = $db->query($sql); $q = $db->query($sql);
while ($row = $db->fetch($q)) { while ($row = $db->fetch($q)) {
@ -466,6 +471,63 @@ function mdf_get(): array {
return array_map('MDFCollectionItem::create_instance', $db->fetchAll($q)); return array_map('MDFCollectionItem::create_instance', $db->fetchAll($q));
} }
/**
 * Loads Mercure de France collection rows for the given ids.
 *
 * The ids are cast to integers before being interpolated into the IN (...)
 * list, so a stray non-numeric value cannot alter the SQL. An empty list
 * short-circuits to an empty result, because "IN ()" is not valid SQL and
 * the query would otherwise fail.
 *
 * Rows come back in whatever order the database yields them, not in the
 * order of $ids; callers that need a specific order must re-sort.
 *
 * @param int[] $ids
 * @return MDFCollectionItem[]
 */
function mdf_get_by_id(array $ids): array {
    $ids = array_map('intval', $ids);
    if (empty($ids))
        return [];

    $db = DB();
    $q = $db->query("SELECT * FROM mdf_collection WHERE id IN (".implode(',', $ids).")");
    return array_map('MDFCollectionItem::create_instance', $db->fetchAll($q));
}
/**
 * Searches the Mercure de France Sphinx RT index.
 *
 * The raw query is passed through sphinx_mkquery() and run in extended match
 * mode, with field weights date > issue > text and relevance sorting. Any
 * error or warning reported by the client is logged. When the query itself
 * fails, an empty result set is returned.
 *
 * Matched documents are loaded via mdf_get_by_id() and then re-ordered to
 * follow the order of the Sphinx matches. The SQL lookup uses IN (...) with
 * no ORDER BY, so without this step the relevance ranking requested from
 * Sphinx would be lost.
 *
 * @param string $q Raw search query
 * @param int $offset Passed to setLimits() as the result offset
 * @param int $count Passed to setLimits() as the result limit
 * @return array{count: int, items: MDFCollectionItem[]}
 */
function mdf_search(string $q, int $offset = 0, int $count = 0): array {
    $query_filtered = sphinx_mkquery($q);

    $cl = sphinx_client();
    $cl->setLimits($offset, $count);
    $cl->setMatchMode(Sphinx\SphinxClient::SPH_MATCH_EXTENDED);
    $cl->setFieldWeights([
        'date' => 10,
        'issue' => 9,
        'text' => 8
    ]);
    $cl->setRankingMode(Sphinx\SphinxClient::SPH_RANK_PROXIMITY_BM25);
    $cl->setSortMode(Sphinx\SphinxClient::SPH_SORT_RELEVANCE);

    // run search
    $result = $cl->query((string)$query_filtered, MDF_ARCHIVE_SPHINX_RTINDEX);

    $error = $cl->getLastError();
    $warning = $cl->getLastWarning();
    if ($error)
        logError(__FUNCTION__, $error);
    if ($warning)
        logWarning(__FUNCTION__, $warning);

    if ($result === false)
        return ['count' => 0, 'items' => []];

    $total_found = (int)$result['total_found'];

    $items = [];
    if (!empty($result['matches'])) {
        // the keys of 'matches' are used as document ids, in ranking order
        $ids = array_keys($result['matches']);

        // index the loaded rows by id so they can be put back in ranking order
        $by_id = [];
        foreach (mdf_get_by_id($ids) as $item)
            $by_id[(int)$item->id] = $item;

        foreach ($ids as $id) {
            // a match may have no row in the database; skip it
            if (isset($by_id[(int)$id]))
                $items[] = $by_id[(int)$id];
        }
    }

    return ['count' => $total_found, 'items' => $items];
}
/**
 * Rebuilds the Mercure de France Sphinx RT index.
 *
 * Truncates the index first, then walks every item returned by mdf_get(),
 * reads its text from `mdf_texts` and inserts one document per item. The
 * `date` field is stored in the item's human-friendly format.
 */
function mdf_reindex(): void {
    $index = MDF_ARCHIVE_SPHINX_RTINDEX;
    sphinx_execute("TRUNCATE RTINDEX ".$index);

    $db = DB();
    $insert_sql = "INSERT INTO ".$index." (id, volume, issue, date, text) VALUES (?, ?, ?, ?, ?)";

    foreach (mdf_get() as $entry) {
        $text_query = $db->query("SELECT text FROM mdf_texts WHERE mdf_id=?", $entry->id);
        $full_text = $db->result($text_query);

        sphinx_execute(
            $insert_sql,
            $entry->id,
            $entry->volume,
            (string)$entry->issue,
            $entry->getHumanFriendlyDate(),
            $full_text
        );
    }
}
/** /**
* @return BookItem[] * @return BookItem[]
*/ */

View File

@ -58,7 +58,7 @@ return <<<HTML
<!-- /Yandex.Metrika counter --> <!-- /Yandex.Metrika counter -->
</body> </body>
</html> </html>
<!-- {$exec_time} s --> {$ctx->if_admin(fn() => "<!-- {$exec_time} s -->")}
HTML; HTML;
} }

View File

@ -187,7 +187,7 @@ if ($disabled)
$mapper = function($s) use ($unsafe_query) { $mapper = function($s) use ($unsafe_query) {
if ($unsafe_query !== null) { if ($unsafe_query !== null) {
return hl_matched($s, [$unsafe_query]); return hl_matched($s, $unsafe_query);
} else { } else {
return htmlescape($s); return htmlescape($s);
} }