526 lines
15 KiB
PHP
526 lines
15 KiB
PHP
<?php
|
|
|
|
require_once 'engine/sphinx.php';
|
|
|
|
// Name of the Sphinx real-time index that wff_search() queries and wff_reindex() truncates and refills.
const WFF_ARCHIVE_SPHINX_RTINDEX = 'wff_collection';
|
|
//const MDF_ARCHIVE_SPHINX_RTINDEX = 'mdf_archive';
|
|
//const BACONIANA_ARCHIVE_SPHINX_RTINDEX = 'baconiana_archive';
|
|
|
|
/**
 * The file collections known to the site; each case is backed by its short string slug.
 */
enum FilesCollection: string
{
    case WilliamFriedman = 'wff';
    case MercureDeFrance = 'mdf';
    case Baconiana = 'baconiana';

    /**
     * Tells whether search is supported for this collection.
     *
     * @return bool true for the William Friedman collection, false for every other case
     */
    public function isSearchSupported(): bool
    {
        return match ($this) {
            self::WilliamFriedman => true,
            default => false,
        };
    }
}
|
|
|
|
/**
 * Discriminates between the two kinds of listing entries: a file or a folder.
 */
enum FilesItemType: string
{
    case FILE = 'file';
    case FOLDER = 'folder';
}
|
|
|
|
/**
 * File-type tag stored on a BookItem ($fileType): none, book or article.
 */
enum BookFileType: string
{
    case NONE = 'none';
    case BOOK = 'book';
    case ARTICLE = 'article';
}
|
|
|
|
/**
 * Category of a row in the `books` table; books_get() filters on the backing value.
 */
enum BookCategory: string
{
    case BOOKS = 'books';
    case MISC = 'misc';
}
|
|
|
|
/**
 * Contract shared by every entry of a files listing (collections, folders and files).
 */
interface FilesItemInterface
{
    // --- identity and kind ---

    /** Identifier of the entry, always as a string. */
    public function getId(): string;

    /** Whether the entry is a folder. */
    public function isFolder(): bool;

    /** Whether the entry is a file. */
    public function isFile(): bool;

    // --- presentation ---

    /** Plain-text title. */
    public function getTitle(): string;

    /** Ready-made HTML title, or null when the entry has none. */
    public function getTitleHtml(): ?string;

    /** Optional secondary line, or null when there is none. */
    public function getSubtitle(): ?string;

    /**
     * Descriptive metadata of the entry.
     *
     * @param string|null $hl_matched optional value an implementation may forward to hl_matched()
     * @return array either empty, or an array carrying an 'items' list (and possibly an 'inline' flag)
     */
    public function getMeta(?string $hl_matched = null): array;

    /** Size of the entry, or null when no size applies. */
    public function getSize(): ?int;

    // --- linking ---

    /** URL the entry links to. */
    public function getUrl(): string;

    /** Whether the link should open in a new browser tab. */
    public function isTargetBlank(): bool;

    /** Whether the entry is available. */
    public function isAvailable(): bool;
}
|
|
|
|
/**
 * Supplies the `$type` property and the file/folder predicates derived from it.
 */
trait FilesItemTypeTrait
{
    public FilesItemType $type;

    /** True when the entry's type is FilesItemType::FOLDER. */
    public function isFolder(): bool
    {
        return $this->type === FilesItemType::FOLDER;
    }

    /** True when the entry's type is FilesItemType::FILE. */
    public function isFile(): bool
    {
        return $this->type === FilesItemType::FILE;
    }
}
|
|
|
|
/**
 * Supplies the `$size` property and a getter that exposes it for files only.
 * The using class must provide isFile().
 */
trait FilesItemSizeTrait
{
    public int $size;

    /**
     * @return int|null the stored size when the entry is a file, null otherwise
     */
    public function getSize(): ?int
    {
        if (!$this->isFile()) {
            return null;
        }

        return $this->size;
    }
}
|
|
|
|
/**
 * Listing entry that stands for a whole collection; it always behaves as a folder.
 */
class CollectionItem implements FilesItemInterface {

    public function __construct(
        protected FilesCollection $collection
    ) {}

    /** The collection's backing string value serves as the id. */
    public function getId(): string {
        return $this->collection->value;
    }

    /** Localised title looked up under the key "files_<slug>_collection". */
    public function getTitle(): string {
        return lang("files_{$this->collection->value}_collection");
    }

    public function getTitleHtml(): ?string {
        return null;
    }

    public function getSubtitle(): ?string {
        return null;
    }

    public function getMeta(?string $hl_matched = null): array {
        return [];
    }

    public function getSize(): ?int {
        return null;
    }

    public function isFolder(): bool {
        return true;
    }

    public function isFile(): bool {
        return false;
    }

    public function isAvailable(): bool {
        return true;
    }

    /**
     * Site-relative listing URL for the Mercure de France and William Friedman
     * collections; absolute URL on the configured files domain for Baconiana.
     */
    public function getUrl(): string {
        global $config;

        return match ($this->collection) {
            FilesCollection::MercureDeFrance,
            FilesCollection::WilliamFriedman => '/files/'.$this->collection->value.'/',
            FilesCollection::Baconiana => 'https://'.$config['files_domain'].'/Baconiana/',
        };
    }

    /** Only the Baconiana collection opens in a new tab. */
    public function isTargetBlank(): bool {
        return $this->collection === FilesCollection::Baconiana;
    }

}
|
|
|
|
/**
 * A file or folder row of the `wff_collection` table (William Friedman collection).
 */
class WFFCollectionItem extends model implements FilesItemInterface {

    const DB_TABLE = 'wff_collection';

    use FilesItemTypeTrait;
    use FilesItemSizeTrait;

    public int $id;
    public int $parentId;
    public string $title;
    public string $documentId;
    public string $path;
    public int $filesCount;

    public function getTitleHtml(): ?string { return null; }
    public function getId(): string { return (string)$this->id; }
    public function isAvailable(): bool { return true; }
    public function getTitle(): string { return $this->title; }

    /**
     * Document identifier of the entry.
     *
     * @return string for folders, the last path segment with underscores turned into spaces;
     *                for files, the stored $documentId
     */
    public function getDocumentId(): string {
        return $this->isFolder()
            ? str_replace('_', ' ', basename($this->path))
            : $this->documentId;
    }

    /** Files open in a new tab, folders do not. */
    public function isTargetBlank(): bool { return $this->isFile(); }

    public function getSubtitle(): ?string { return null; }

    /**
     * Folders link to their listing page on the site; files link to the PDF on the
     * configured files domain.
     */
    public function getUrl(): string {
        global $config;
        return $this->isFolder()
            ? "/files/wff/{$this->id}/"
            : "https://{$config['files_domain']}/NSA Friedman Documents/{$this->path}";
    }

    /**
     * Builds the metadata of the entry.
     *
     * @param string|null $hl_matched value forwarded to hl_matched() together with the document id
     * @return array empty for a folder without a parent; otherwise an array with an 'items'
     *               list (document id and file count for folders; document id, size and
     *               'PDF' for files, plus 'inline' => false)
     */
    public function getMeta(?string $hl_matched = null): array {
        if ($this->isFolder()) {
            if (!$this->parentId)
                return [];
            return [
                'items' => [
                    hl_matched($this->getDocumentId(), $hl_matched),
                    lang_num('files_count', $this->filesCount)
                ]
            ];
        }
        return [
            'inline' => false,
            'items' => [
                // Forward $hl_matched here too, as the folder branch does; it used to be
                // dropped, so the document id of a file was never highlighted.
                hl_matched('Document '.$this->documentId, $hl_matched),
                sizeString($this->size),
                'PDF'
            ]
        ];
    }

}
|
|
|
|
/**
 * One issue of the Mercure de France collection, backed by a row of `mdf_collection`.
 */
class MDFCollectionItem extends model implements FilesItemInterface {

    const DB_TABLE = 'mdf_collection';

    use FilesItemTypeTrait;
    use FilesItemSizeTrait;

    public int $id;
    public int $issue;
    public string $path;
    public string $date;
    public int $volume;
    public int $pageFrom;
    public int $pageTo;
    public int $pdfPages;
    public int $size;

    /** The issue number, not the row id, identifies the entry. */
    public function getId(): string {
        return (string)$this->issue;
    }

    /** Title made of the issue number and the date in "j M Y" format. */
    public function getTitle(): string {
        return '№'.$this->issue.', '.$this->getHumanFriendlyDate();
    }

    public function getTitleHtml(): ?string {
        return null;
    }

    /**
     * No subtitle is shown; a "Vol. …, pp. …" variant existed only as commented-out code.
     */
    public function getSubtitle(): ?string {
        return null;
    }

    public function isAvailable(): bool {
        return true;
    }

    public function isTargetBlank(): bool {
        return true;
    }

    /** Absolute URL of the issue's file on the configured files domain. */
    public function getUrl(): string {
        global $config;
        $base = 'https://'.$config['files_domain'].'/Mercure-de-France-OCR/';
        return $base.$this->path;
    }

    /**
     * Inline metadata: volume in Roman numerals, page range, size and the 'PDF' label.
     *
     * @param string|null $hl_matched not used by this implementation
     */
    public function getMeta(?string $hl_matched = null): array {
        $volume = 'Vol. '.$this->getRomanVolume();
        $pages = 'pp. '.$this->pageFrom.'-'.$this->pageTo;

        return [
            'inline' => true,
            'items' => [$volume, $pages, sizeString($this->size), 'PDF'],
        ];
    }

    /** The volume number converted by _arabic_to_roman(). */
    public function getRomanVolume(): string {
        return _arabic_to_roman($this->volume);
    }

    /** Parses $date with DateTime and renders it as e.g. "5 Jan 1900". */
    protected function getHumanFriendlyDate(): string {
        return (new DateTime($this->date))->format('j M Y');
    }

}
|
|
|
|
/**
 * A file or folder row of the `books` table.
 */
class BookItem extends model implements FilesItemInterface {

    const DB_TABLE = 'books';

    public int $id;
    public int $parentId;
    public string $author;
    public string $title;
    public int $year;
    public int $size;
    public FilesItemType $type;
    public BookFileType $fileType;
    public string $path;
    public bool $external;

    use FilesItemSizeTrait;
    use FilesItemTypeTrait;

    /**
     * @return string the numeric row id as a string
     */
    public function getId(): string {
        // Cast explicitly, as the sibling item classes do, instead of relying on the
        // implicit int-to-string coercion of the return value.
        return (string)$this->id;
    }

    /**
     * Non-external folders link to their listing page; everything else links to
     * $path on the configured files domain (a leading slash is added when missing).
     */
    public function getUrl(): string {
        if ($this->isFolder() && !$this->external)
            return '/files/'.$this->id.'/';
        global $config;
        $buf = 'https://'.$config['files_domain'];
        if (!str_starts_with($this->path, '/'))
            $buf .= '/';
        $buf .= $this->path;
        return $buf;
    }

    /**
     * HTML title with the author in bold followed by the title.
     *
     * @return string|null null for folders and for entries without an author
     */
    public function getTitleHtml(): ?string {
        if ($this->isFolder() || !$this->author)
            return null;
        $buf = '<b class="is-author">'.htmlescape($this->author).'</b><span class="is-title">';
        // Separate author and title with a period unless the author already ends with one.
        if (!str_ends_with($this->author, '.'))
            $buf .= '.';
        $buf .= ' '.htmlescape($this->title).'</span>';
        return $buf;
    }

    public function getTitle(): string {
        return $this->title;
    }

    /**
     * @param string|null $hl_matched not used by this implementation
     * @return array empty for folders; for files the size string and upper-cased extension
     */
    public function getMeta(?string $hl_matched = null): array {
        if ($this->isFolder())
            return [];

        $items = [
            sizeString($this->size),
            strtoupper($this->getExtension())
        ];

        return [
            'inline' => false,
            'items' => $items
        ];
    }

    /** Extension of the last path segment, as reported by extension(). */
    protected function getExtension(): string {
        return extension(basename($this->path));
    }

    public function isAvailable(): bool {
        return true;
    }

    /** Files and external entries open in a new tab. */
    public function isTargetBlank(): bool {
        return $this->isFile() || $this->external;
    }

    /**
     * @return string|null the year in parentheses, or null when the year is 0
     */
    public function getSubtitle(): ?string {
        if (!$this->year)
            return null;
        return '('.$this->year.')';
    }

}
|
|
|
|
/**
 * Loads a folder of the `wff_collection` table, optionally followed by its ancestors.
 *
 * @param int $folder_id id of the row to load
 * @param bool $with_parents when true, return the folder followed by its chain of parent
 *                           folders (nearest first); the chain stops at the first parent
 *                           that is missing or is not a folder
 * @return WFFCollectionItem|WFFCollectionItem[]|null null when the row does not exist or is not a folder
 */
function wff_get_folder(int $folder_id, bool $with_parents = false): WFFCollectionItem|array|null {
    $db = DB();
    $chain = [];
    $current_id = $folder_id;

    do {
        $q = $db->query("SELECT * FROM wff_collection WHERE id=?", $current_id);
        if (!$db->numRows($q))
            break;

        $folder = new WFFCollectionItem($db->fetch($q));
        if (!$folder->isFolder())
            break;

        $chain[] = $folder;
        $current_id = $folder->parentId;
    } while ($with_parents && $current_id);

    if (empty($chain))
        return null;

    return $with_parents ? $chain : $chain[0];
}
|
|
|
|
/**
 * Fetches rows of `wff_collection`, ordered by title.
 *
 * @param int|int[]|null $parent_id null selects every row, an int selects the children of
 *                                  that parent, an array selects the children of any of the
 *                                  listed parents
 * @return WFFCollectionItem[] empty when an empty array of parent ids is given
 */
function wff_get(int|array|null $parent_id = null): array {
    $where = [];
    $args = [];

    if (is_int($parent_id)) {
        $where[] = "parent_id=?";
        $args[] = $parent_id;
    } elseif (is_array($parent_id)) {
        // An empty list cannot match anything and would produce the invalid SQL "IN ()".
        if (empty($parent_id))
            return [];
        // The ids are interpolated into the statement, so force them to integers first.
        $where[] = "parent_id IN (".implode(", ", array_map('intval', $parent_id)).")";
    }

    $sql = "SELECT * FROM wff_collection";
    if (!empty($where))
        $sql .= " WHERE ".implode(" AND ", $where);
    $sql .= " ORDER BY title";

    $db = DB();
    $q = $db->query($sql, ...$args);

    return array_map('WFFCollectionItem::create_instance', $db->fetchAll($q));
}
|
|
|
|
/**
 * Fetches the `wff_collection` rows with the given ids.
 *
 * The rows come back in whatever order the database returns them, not in the order of $ids.
 *
 * @param int[] $ids
 * @return WFFCollectionItem[] empty when $ids is empty
 */
function wff_get_by_id(array $ids): array {
    // Nothing to look up; also avoids the invalid SQL "IN ()".
    if (empty($ids))
        return [];

    // The ids are interpolated into the statement, so force them to integers first.
    $id_list = implode(',', array_map('intval', $ids));

    $db = DB();
    $q = $db->query("SELECT * FROM wff_collection WHERE id IN (".$id_list.")");
    return array_map('WFFCollectionItem::create_instance', $db->fetchAll($q));
}
|
|
|
|
/**
 * Searches the William Friedman collection through the Sphinx real-time index.
 *
 * @param string $q raw query; it is passed through sphinx_mkquery() with 'star' disabled
 * @param int $offset offset passed to the Sphinx client's setLimits()
 * @param int $count limit passed to the Sphinx client's setLimits()
 * @return array ['count' => total number of matches reported by Sphinx,
 *                'items' => WFFCollectionItem[] in Sphinx result order];
 *               count 0 and no items when the query fails
 */
function wff_search(string $q, int $offset = 0, int $count = 0): array {
    $query_filtered = sphinx_mkquery($q, [
        'star' => false,
    ]);

    $cl = sphinx_client();
    $cl->setLimits($offset, $count);
    $cl->setMatchMode(Sphinx\SphinxClient::SPH_MATCH_EXTENDED);
    $cl->setFieldWeights([
        'title' => 50,
        'document_id' => 60,
    ]);

    $cl->setRankingMode(Sphinx\SphinxClient::SPH_RANK_PROXIMITY_BM25);
    $cl->setSortMode(Sphinx\SphinxClient::SPH_SORT_EXTENDED, '@relevance DESC, is_folder DESC');

    // run search
    $result = $cl->query($query_filtered, WFF_ARCHIVE_SPHINX_RTINDEX);
    $error = $cl->getLastError();
    $warning = $cl->getLastWarning();
    if ($error)
        logError(__FUNCTION__, $error);
    if ($warning)
        logWarning(__FUNCTION__, $warning);
    if ($result === false)
        return ['count' => 0, 'items' => []];

    $total_found = (int)$result['total_found'];

    $items = [];
    if (!empty($result['matches'])) {
        $ids = array_keys($result['matches']);

        // The database returns the rows in its own order, which would throw away the
        // sort order requested from Sphinx above; put them back into match order.
        $by_id = [];
        foreach (wff_get_by_id($ids) as $item)
            $by_id[$item->id] = $item;

        foreach ($ids as $id) {
            // A match may have no row if the index is stale; skip it.
            if (isset($by_id[$id]))
                $items[] = $by_id[$id];
        }
    }

    return ['count' => $total_found, 'items' => $items];
}
|
|
|
|
/**
 * Rebuilds the Sphinx real-time index from the `wff_collection` table.
 *
 * The index is truncated first, then one document is inserted per row. For files the
 * text is read from /home/user/nsa/txt/<name>.txt, where <name> is the base name of the
 * item's path with ".pdf" replaced by ".txt"; folders are indexed with empty text.
 * A file whose text cannot be read is logged and indexed with empty text.
 */
function wff_reindex(): void {
    sphinx_execute("TRUNCATE RTINDEX ".WFF_ARCHIVE_SPHINX_RTINDEX);
    $db = DB();
    $q = $db->query("SELECT * FROM wff_collection");
    while ($row = $db->fetch($q)) {
        $item = new WFFCollectionItem($row);
        $txt = '';
        if ($item->isFile()) {
            $txt_path = '/home/user/nsa/txt/'.str_replace('.pdf', '.txt', basename($item->path));
            // file_get_contents() returns false on failure; never pass that on as the text.
            $contents = is_file($txt_path) ? file_get_contents($txt_path) : false;
            if ($contents === false)
                logWarning(__FUNCTION__, "cannot read {$txt_path}, indexing item {$item->id} without text");
            else
                $txt = $contents;
        }
        sphinx_execute("INSERT INTO ".WFF_ARCHIVE_SPHINX_RTINDEX." (id, document_id, title, text, is_folder, parent_id) VALUES (?, ?, ?, ?, ?, ?)",
            $item->id, $item->getDocumentId(), $item->title, $txt, (int)$item->isFolder(), $item->parentId);
    }
}
|
|
|
|
/**
 * Extracts a short excerpt around the first keyword occurrence from `wff_texts`.
 *
 * For each text, the earliest position of any keyword is located (case-insensitively, via
 * LOWER(text)); the excerpt starts up to $before characters ahead of it. A text that
 * contains none of the keywords is treated as matching just past its end.
 *
 * @param int[] $ids ids looked up in the wff_id column
 * @param string[] $keywords Must already be lower-cased
 * @param int $before number of characters to include before the match
 * @param int $after number of characters to include after the match position
 * @return array map of id => ['excerpt' => string with whitespace runs collapsed to one
 *               space, 'index' => int start position of the excerpt], or id => null
 *               when no row was found for that id; with no keywords every id maps to null
 */
function wff_get_text_excerpts(array $ids,
                               array $keywords,
                               int $before = 50,
                               int $after = 40): array {
    $results = [];
    foreach ($ids as $id)
        $results[$id] = null;

    // Without ids the query would end in the invalid "IN ()"; without keywords there is
    // nothing to locate. In both cases there is nothing to query.
    if (empty($ids) || empty($keywords))
        return $results;

    $db = DB();

    // For every keyword: its position in the text, or one past the end when absent.
    $positions = [];
    foreach ($keywords as $keyword) {
        $part = "LOCATE('".$db->escape($keyword)."', LOWER(text))";
        $positions[] = "IF({$part} > 0, {$part}, CHAR_LENGTH(text) + 1)";
    }

    // LEAST() needs at least two arguments, so a single keyword is used as is.
    $combined_parts = count($positions) > 1
        ? 'LEAST('.implode(', ', $positions).')'
        : $positions[0];

    // The ids are interpolated into the statement, so force them to integers first.
    $id_list = implode(',', array_map('intval', $ids));

    $total = $before + $after;
    $sql = "SELECT
            wff_id AS id,
            GREATEST(
                1,
                {$combined_parts} - {$before}
            ) AS excerpt_start_index,
            SUBSTRING(
                text,
                GREATEST(
                    1,
                    {$combined_parts} - {$before}
                ),
                LEAST(
                    CHAR_LENGTH(text),
                    {$total} + {$combined_parts} - GREATEST(1, {$combined_parts} - {$before})
                )
            ) AS excerpt
        FROM
            wff_texts
        WHERE
            wff_id IN (".$id_list.")";

    $q = $db->query($sql);
    while ($row = $db->fetch($q)) {
        $results[$row['id']] = [
            'excerpt' => preg_replace('/\s+/', ' ', $row['excerpt']),
            'index' => (int)$row['excerpt_start_index']
        ];
    }

    return $results;
}
|
|
|
|
/**
 * Loads every row of `mdf_collection`, ordered by date.
 *
 * @return MDFCollectionItem[]
 */
function mdf_get(): array {
    $db = DB();
    $rows = $db->fetchAll(
        $db->query("SELECT * FROM mdf_collection ORDER BY `date`")
    );

    return array_map([MDFCollectionItem::class, 'create_instance'], $rows);
}
|
|
|
|
/**
 * Loads the `books` rows of one category that sit directly under a parent.
 *
 * Rows are ordered by type first. In the BOOKS category they are then ordered by author
 * and title, preceded by year when $parent_id is not 0; in any other category by title only.
 *
 * @param int $parent_id value matched against the parent_id column
 * @param BookCategory $category category whose backing value is matched
 * @return BookItem[]
 */
function books_get(int $parent_id = 0,
                   BookCategory $category = BookCategory::BOOKS): array {
    $columns = ['type'];
    if ($category === BookCategory::BOOKS) {
        if ($parent_id !== 0)
            $columns[] = 'year';
        $columns[] = 'author';
    }
    $columns[] = 'title';
    $order_by = implode(', ', $columns);

    $db = DB();
    $q = $db->query("SELECT * FROM books WHERE category=? AND parent_id=? ORDER BY $order_by",
        $category->value, $parent_id);

    return array_map([BookItem::class, 'create_instance'], $db->fetchAll($q));
}
|
|
|
|
/**
 * Loads a folder from the `books` table.
 *
 * @param int $id row id
 * @return BookItem|null null when no such row exists or the row is not a folder
 */
function books_get_folder(int $id): ?BookItem {
    $db = DB();
    $q = $db->query("SELECT * FROM books WHERE id=?", $id);

    if ($db->numRows($q)) {
        $candidate = new BookItem($db->fetch($q));
        if ($candidate->isFolder())
            return $candidate;
    }

    return null;
}
|
|
|
|
/**
 * Converts a number to Roman numerals by greedy subtraction.
 *
 * Values are tried from largest to smallest, including the subtractive pairs
 * (CM, CD, XC, XL, IX, IV). A number below 1 yields an empty string.
 *
 * @param int $number the value to convert
 * @return string the Roman numeral
 */
function _arabic_to_roman($number) {
    $values = [1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1];
    $numerals = ['M', 'CM', 'D', 'CD', 'C', 'XC', 'L', 'XL', 'X', 'IX', 'V', 'IV', 'I'];

    $roman = '';
    for ($i = 0, $steps = count($values); $i < $steps; $i++) {
        // Emit the current numeral for as long as its value still fits.
        for (; $number >= $values[$i]; $number -= $values[$i])
            $roman .= $numerals[$i];
    }

    return $roman;
}
|