diff --git a/functions.php b/functions.php index fa9cd0e..cb9e007 100644 --- a/functions.php +++ b/functions.php @@ -308,15 +308,20 @@ function pcre_check_error(mixed &$result, bool $no_error = false): bool { return true; } -/** - * @param string $s - * @param string|string[]|null $keywords - * @return string - */ -function hl_matched(string $s, string|array|null $keywords = []): string { +function hl_matched(string $s, string|Stringable|SkinString|array|null $keywords = []): string { if (is_null($keywords)) return htmlescape($s); + if ($keywords instanceof Stringable) + $keywords = $keywords->__toString(); + else if (is_array($keywords)) { + $keywords = array_map(function($s) { + if ($s instanceof Stringable) + return $s->__toString(); + return $s; + }, $keywords); + } + if (is_string($keywords)) $keywords = preg_split('/\s+/', $keywords); diff --git a/handler/FilesHandler.php b/handler/FilesHandler.php index 337c835..e570734 100644 --- a/handler/FilesHandler.php +++ b/handler/FilesHandler.php @@ -115,8 +115,55 @@ class FilesHandler extends request_handler { break; case FilesCollection::MercureDeFrance: - $files = mdf_get(); - set_title('$files_mdf_collection'); + if ($query !== null) { + $files = mdf_search($query, $offset, self::SEARCH_RESULTS_PER_PAGE); + $vars += [ + 'search_count' => $files['count'], + 'search_query' => $query + ]; + + /** @var MDFCollectionItem[] $files */ + $files = $files['items']; + + $query_words = array_map('mb_strtolower', preg_split('/\s+/', $query)); + $found = []; + $result_ids = []; + foreach ($files as $file) { + $result_ids[] = $file->id; + + foreach ([ + $file->date, + (string)$file->issue + ] as $haystack) { + foreach ($query_words as $qw) { + if (mb_strpos($haystack, $qw) !== false) { + $found[$file->id] = true; + continue 2; + } + } + } + } + + $found = array_map('intval', array_keys($found)); + $not_found = array_diff($result_ids, $found); + if (!empty($not_found)) + $text_excerpts = mdf_get_text_excerpts($not_found, $query_words); + + if (is_xhr_request()) { + ajax_ok([ + ...$vars, + 'new_offset' => $offset + count($files), + 'html' => skin('files')->collection_files($files, $query, $text_excerpts) + ]); + } + } else { + $files = mdf_get(); + } + + $title = lang('files_mdf_collection'); + if ($query) + $title .= ' - '.htmlescape($query); + set_title($title); break; } diff --git a/lib/files.php b/lib/files.php index 39716e5..9a47cc0 100644 --- a/lib/files.php +++ b/lib/files.php @@ -3,7 +3,7 @@ require_once 'engine/sphinx.php'; const WFF_ARCHIVE_SPHINX_RTINDEX = 'wff_collection'; -//const MDF_ARCHIVE_SPHINX/**/_RTINDEX = 'mdf_archive'; +const MDF_ARCHIVE_SPHINX_RTINDEX = 'mdf_archive'; //const BACONIANA_ARCHIVE_SPHINX_RTINDEX = 'baconiana_archive'; enum FilesCollection: string { @@ -12,7 +12,7 @@ enum FilesCollection: string { case Baconiana = 'baconiana'; public function isSearchSupported(): bool { - return $this == FilesCollection::WilliamFriedman; + return $this == FilesCollection::WilliamFriedman || $this == FilesCollection::MercureDeFrance; } } @@ -161,13 +161,13 @@ class MDFCollectionItem extends model implements FilesItemInterface { return "№{$this->issue}, {$this->getHumanFriendlyDate()}"; } - protected function getHumanFriendlyDate(): string { + public function getHumanFriendlyDate(): string { $dt = new DateTime($this->date); return $dt->format('j M Y'); } public function isTargetBlank(): bool { return true; } - public function getId(): string { return (string)$this->issue; } + public function getId(): string { return $this->id; } public function getUrl(): string { global $config; return 'https://'.$config['files_domain'].'/Mercure-de-France-OCR/'.$this->path; @@ -339,9 +339,7 @@ function wff_get_by_id(array $ids): array { } function wff_search(string $q, int $offset = 0, int $count = 0): array { - $query_filtered = sphinx_mkquery($q, [ - 'star' => false, - ]); + $query_filtered = sphinx_mkquery($q); $cl = sphinx_client(); $cl->setLimits($offset, $count); @@ -391,17 +389,24 @@ function wff_reindex(): void { } } +function mdf_get_text_excerpts(array $ids, array $keywords, int $before = 50, int $after = 40): array { + return _get_text_excerpts('mdf_texts', 'mdf_id', $ids, $keywords, $before, $after); +} + +function wff_get_text_excerpts(array $ids, array $keywords, int $before = 50, int $after = 40): array { + return _get_text_excerpts('wff_texts', 'wff_id', $ids, $keywords, $before, $after); +} + /** + * @param string $table + * @param string $field_id * @param int[] $ids * @param string[] $keywords Must already be lower-cased * @param int $before * @param int $after * @return array */ -function wff_get_text_excerpts(array $ids, - array $keywords, - int $before = 50, - int $after = 40): array { +function _get_text_excerpts(string $table, string $field_id, array $ids, array $keywords, int $before, int $after) { $results = []; foreach ($ids as $id) $results[$id] = null; @@ -411,7 +416,7 @@ function wff_get_text_excerpts(array $ids, $dynamic_sql_parts = []; $combined_parts = []; foreach ($keywords as $keyword) { - $part = "LOCATE('".$db->escape($keyword)."', LOWER(text))"; + $part = "LOCATE('".$db->escape($keyword)."', text)"; $dynamic_sql_parts[] = $part; } if (count($dynamic_sql_parts) > 1) { @@ -425,7 +430,7 @@ function wff_get_text_excerpts(array $ids, $total = $before + $after; $sql = "SELECT - wff_id AS id, + {$field_id} AS id, GREATEST( 1, {$combined_parts} - {$before} @@ -442,9 +447,9 @@ function wff_get_text_excerpts(array $ids, ) ) AS excerpt FROM - wff_texts + {$table} WHERE - wff_id IN (".implode(',', $ids).")"; + {$field_id} IN (".implode(',', $ids).")"; $q = $db->query($sql); while ($row = $db->fetch($q)) { @@ -466,6 +471,63 @@ function mdf_get(): array { return array_map('MDFCollectionItem::create_instance', $db->fetchAll($q)); } +/** + * @param int[] $ids + * @return MDFCollectionItem[] + */ +function mdf_get_by_id(array $ids): array { + $db = DB(); + $q = $db->query("SELECT * FROM mdf_collection WHERE id IN (".implode(',', $ids).")"); + return array_map('MDFCollectionItem::create_instance', $db->fetchAll($q)); +} + +function mdf_search(string $q, int $offset = 0, int $count = 0): array { + $query_filtered = sphinx_mkquery($q); + + $cl = sphinx_client(); + $cl->setLimits($offset, $count); + $cl->setMatchMode(Sphinx\SphinxClient::SPH_MATCH_EXTENDED); + $cl->setFieldWeights([ + 'date' => 10, + 'issue' => 9, + 'text' => 8 + ]); + + $cl->setRankingMode(Sphinx\SphinxClient::SPH_RANK_PROXIMITY_BM25); + $cl->setSortMode(Sphinx\SphinxClient::SPH_SORT_RELEVANCE); + + // run search + $final_query = "$query_filtered"; + $result = $cl->query($final_query, MDF_ARCHIVE_SPHINX_RTINDEX); + $error = $cl->getLastError(); + $warning = $cl->getLastWarning(); + if ($error) + logError(__FUNCTION__, $error); + if ($warning) + logWarning(__FUNCTION__, $warning); + if ($result === false) + return ['count' => 0, 'items' => []]; + + $total_found = (int)$result['total_found']; + + $items = []; + if (!empty($result['matches'])) + $items = mdf_get_by_id(array_keys($result['matches'])); + + return ['count' => $total_found, 'items' => $items]; +} + +function mdf_reindex(): void { + sphinx_execute("TRUNCATE RTINDEX ".MDF_ARCHIVE_SPHINX_RTINDEX); + $db = DB(); + $mdf = mdf_get(); + foreach ($mdf as $item) { + $text = $db->result($db->query("SELECT text FROM mdf_texts WHERE mdf_id=?", $item->id)); + sphinx_execute("INSERT INTO ".MDF_ARCHIVE_SPHINX_RTINDEX." (id, volume, issue, date, text) VALUES (?, ?, ?, ?, ?)", + $item->id, $item->volume, (string)$item->issue, $item->getHumanFriendlyDate(), $text); + } +} + /** * @return BookItem[] */ diff --git a/skin/base.phps b/skin/base.phps index b59a2f1..bad2f9f 100644 --- a/skin/base.phps +++ b/skin/base.phps @@ -58,7 +58,7 @@ return << - +{$ctx->if_admin(fn() => "")} HTML; } diff --git a/skin/files.phps b/skin/files.phps index 0c0b618..dc2a1f0 100644 --- a/skin/files.phps +++ b/skin/files.phps @@ -187,7 +187,7 @@ if ($disabled) $mapper = function($s) use ($unsafe_query) { if ($unsafe_query !== null) { - return hl_matched($s, [$unsafe_query]); + return hl_matched($s, $unsafe_query); } else { return htmlescape($s); }