4in1_ws_web/lib/files.php
2024-03-13 17:19:27 +00:00

587 lines
17 KiB
PHP

<?php
require_once 'engine/sphinx.php';
// Names of the Sphinx real-time indexes that the search and reindex functions
// in this file query, truncate and fill.
const WFF_ARCHIVE_SPHINX_RTINDEX = 'wff_collection';
const MDF_ARCHIVE_SPHINX_RTINDEX = 'mdf_archive';
// No index constant is active for the Baconiana collection.
//const BACONIANA_ARCHIVE_SPHINX_RTINDEX = 'baconiana_archive';
/**
 * Top-level file collections.
 * The backing string is the collection's short identifier.
 */
enum FilesCollection: string {
    case WilliamFriedman = 'wff';
    case MercureDeFrance = 'mdf';
    case Baconiana = 'baconiana';

    /**
     * Tells whether the collection can be searched.
     *
     * @return bool true for WilliamFriedman and MercureDeFrance, false otherwise
     */
    public function isSearchSupported(): bool {
        return match ($this) {
            self::WilliamFriedman, self::MercureDeFrance => true,
            default => false,
        };
    }
}
/**
 * Kind of a listing entry: a file or a folder.
 */
enum FilesItemType: string {
    case FILE   = 'file';
    case FOLDER = 'folder';
}
/**
 * File type tag of a book entry: none, a book, or an article.
 */
enum BookFileType: string {
    case NONE    = 'none';
    case BOOK    = 'book';
    case ARTICLE = 'article';
}
/**
 * Category a book entry is filed under; the backing string is the value
 * compared against the `category` column when books are queried.
 */
enum BookCategory: string {
    case BOOKS = 'books';
    case MISC  = 'misc';
}
/**
 * Common contract of everything that can appear in a file listing:
 * whole collections, collection entries and books.
 */
interface FilesItemInterface {
    /** Identifier of the item, always as a string. */
    public function getId(): string;

    /** Whether the item is a folder. */
    public function isFolder(): bool;

    /** Whether the item is a file. */
    public function isFile(): bool;

    /** Link target of the item. */
    public function getUrl(): string;

    /** Size of the item, or null when no size applies. */
    public function getSize(): ?int;

    /** Plain-text title. */
    public function getTitle(): string;

    /** Ready-made HTML title, or null when there is none. */
    public function getTitleHtml(): ?string;

    /**
     * Extra descriptive data for the item.
     *
     * @param ?string $hl_matched optional text an implementation may use to highlight matches
     * @return array empty, or an array with an 'items' list and optionally an 'inline' flag
     */
    public function getMeta(?string $hl_matched = null): array;

    /** Availability flag of the item. */
    public function isAvailable(): bool;

    /** Whether the link should open in a new browser tab. */
    public function isTargetBlank(): bool;

    /** Optional subtitle, or null when there is none. */
    public function getSubtitle(): ?string;
}
/**
 * Provides the $type property and the file/folder predicates derived from it.
 */
trait FilesItemTypeTrait {
    public FilesItemType $type;

    /** True when $type is FilesItemType::FOLDER. */
    public function isFolder(): bool {
        return $this->type === FilesItemType::FOLDER;
    }

    /** True when $type is FilesItemType::FILE. */
    public function isFile(): bool {
        return $this->type === FilesItemType::FILE;
    }
}
/**
 * Provides the $size property and a getSize() that only reports it for files.
 * The using class must supply isFile().
 */
trait FilesItemSizeTrait {
    public int $size;

    /**
     * @return ?int the stored size for files, null for anything else
     */
    public function getSize(): ?int {
        if (!$this->isFile())
            return null;

        return $this->size;
    }
}
/**
 * Listing entry that stands for a whole collection.
 * It always behaves as an available folder without size, meta data or subtitle.
 */
class CollectionItem implements FilesItemInterface {
    public function __construct(
        protected FilesCollection $collection
    ) {}

    /** The collection's backing string doubles as the item id. */
    public function getId(): string {
        return $this->collection->value;
    }

    /** Title is looked up through lang() under "files_<collection>_collection". */
    public function getTitle(): string {
        $lang_key = "files_{$this->collection->value}_collection";
        return lang($lang_key);
    }

    public function getTitleHtml(): ?string {
        return null;
    }

    public function getSubtitle(): ?string {
        return null;
    }

    /**
     * Baconiana links to the files domain taken from the global $config;
     * every other collection links to its local /files/<id>/ page.
     */
    public function getUrl(): string {
        global $config;

        return match ($this->collection) {
            FilesCollection::Baconiana => 'https://'.$config['files_domain'].'/Baconiana/',
            default => '/files/'.$this->collection->value.'/',
        };
    }

    /** Only the Baconiana link opens in a new tab. */
    public function isTargetBlank(): bool {
        return $this->collection === FilesCollection::Baconiana;
    }

    public function isFolder(): bool {
        return true;
    }

    public function isFile(): bool {
        return false;
    }

    public function isAvailable(): bool {
        return true;
    }

    public function getSize(): ?int {
        return null;
    }

    public function getMeta(?string $hl_matched = null): array {
        return [];
    }
}
/**
 * A file or folder entry of the William Friedman collection (table wff_collection).
 */
class WFFCollectionItem extends model implements FilesItemInterface {
    const DB_TABLE = 'wff_collection';

    use FilesItemTypeTrait;
    use FilesItemSizeTrait;

    public int $id;
    public int $parentId;
    public string $title;
    public string $documentId;
    public string $path;
    public int $filesCount;

    public function getTitleHtml(): ?string { return null; }

    public function getId(): string { return (string)$this->id; }

    public function isAvailable(): bool { return true; }

    public function getTitle(): string { return $this->title; }

    /**
     * Document id of the entry.
     * Folders derive it from the last path segment with underscores turned
     * into spaces; files use the stored $documentId.
     */
    public function getDocumentId(): string {
        return $this->isFolder()
            ? str_replace('_', ' ', basename($this->path))
            : $this->documentId;
    }

    /** Files open in a new tab, folders do not. */
    public function isTargetBlank(): bool { return $this->isFile(); }

    public function getSubtitle(): ?string { return null; }

    /**
     * Folders link to their local listing page; files link to the PDF on the
     * files domain taken from the global $config.
     */
    public function getUrl(): string {
        global $config;
        return $this->isFolder()
            ? "/files/wff/{$this->id}/"
            : "https://{$config['files_domain']}/NSA Friedman Documents/{$this->path}";
    }

    /**
     * Meta data shown next to the entry.
     *
     * @param ?string $hl_matched text handed to hl_matched() for highlighting
     *                            (presumably the search keywords; confirm against callers)
     * @return array empty for root folders (no parent); otherwise an 'items' list,
     *               plus 'inline' => false for files
     */
    public function getMeta(?string $hl_matched = null): array {
        if ($this->isFolder()) {
            if (!$this->parentId)
                return [];
            return [
                'items' => [
                    hl_matched($this->getDocumentId(), $hl_matched),
                    lang_num('files_count', $this->filesCount)
                ]
            ];
        }
        return [
            'inline' => false,
            'items' => [
                // Pass the highlight text here too, exactly like the folder branch does;
                // without it a match on the document id was never highlighted for files.
                hl_matched('Document '.$this->documentId, $hl_matched),
                sizeString($this->size),
                'PDF'
            ]
        ];
    }
}
/**
 * An issue of the Mercure de France collection (table mdf_collection).
 */
class MDFCollectionItem extends model implements FilesItemInterface {
    const DB_TABLE = 'mdf_collection';

    use FilesItemTypeTrait;
    use FilesItemSizeTrait;

    public int $id;
    public int $issue;
    public string $path;
    public string $date;
    public int $volume;
    public int $pageFrom;
    public int $pageTo;
    public int $pdfPages;
    public int $size;

    public function isAvailable(): bool { return true; }

    public function getTitleHtml(): ?string { return null; }

    /** Title is "<issue>, <human friendly date>". */
    public function getTitle(): string {
        return "{$this->issue}, {$this->getHumanFriendlyDate()}";
    }

    /**
     * Formats $date as day, short month name and year (format 'j M Y').
     */
    public function getHumanFriendlyDate(): string {
        $dt = new DateTime($this->date);
        return $dt->format('j M Y');
    }

    public function isTargetBlank(): bool { return true; }

    /**
     * The id is stored as an int; cast explicitly so the declared string return
     * type does not depend on implicit coercion (same as WFFCollectionItem::getId()).
     */
    public function getId(): string { return (string)$this->id; }

    /** Link to the PDF on the files domain taken from the global $config. */
    public function getUrl(): string {
        global $config;
        return 'https://'.$config['files_domain'].'/Mercure-de-France-OCR/'.$this->path;
    }

    /**
     * Meta data shown next to the entry: volume, page range, size and format.
     *
     * @param ?string $hl_matched accepted for interface compatibility, not used here
     * @return array 'inline' => true plus the 'items' list
     */
    public function getMeta(?string $hl_matched = null): array {
        return [
            'inline' => true,
            'items' => [
                'Vol. '.$this->getRomanVolume(),
                'pp. '.$this->pageFrom.'-'.$this->pageTo,
                sizeString($this->size),
                'PDF'
            ]
        ];
    }

    /** Volume number written in Roman numerals. */
    public function getRomanVolume(): string {
        return _arabic_to_roman($this->volume);
    }

    /** No subtitle: volume and page range are already part of getMeta(). */
    public function getSubtitle(): ?string {
        return null;
    }
}
/**
 * A file or folder entry of the books library (table books).
 */
class BookItem extends model implements FilesItemInterface {
    const DB_TABLE = 'books';

    public int $id;
    public int $parentId;
    public string $author;
    public string $title;
    public int $year;
    public int $size;
    public FilesItemType $type;
    public BookFileType $fileType;
    public string $path;
    public bool $external;

    use FilesItemSizeTrait;
    use FilesItemTypeTrait;

    /**
     * The id is stored as an int; cast explicitly so the declared string return
     * type does not depend on implicit coercion (same as WFFCollectionItem::getId()).
     */
    public function getId(): string {
        return (string)$this->id;
    }

    /**
     * Non-external folders link to their local listing page; everything else
     * links to $path on the files domain taken from the global $config.
     */
    public function getUrl(): string {
        if ($this->isFolder() && !$this->external)
            return '/files/'.$this->id.'/';
        global $config;
        $buf = 'https://'.$config['files_domain'];
        // $path may or may not start with a slash; make sure exactly one separates host and path.
        if (!str_starts_with($this->path, '/'))
            $buf .= '/';
        $buf .= $this->path;
        return $buf;
    }

    /**
     * HTML title "<b>author</b>. title" for files that have an author.
     *
     * @return ?string null for folders and for entries without an author
     */
    public function getTitleHtml(): ?string {
        if ($this->isFolder() || !$this->author)
            return null;
        $buf = '<b class="is-author">'.htmlescape($this->author).'</b><span class="is-title">';
        // Avoid a doubled period when the author string already ends with one.
        if (!str_ends_with($this->author, '.'))
            $buf .= '.';
        $buf .= ' '.htmlescape($this->title).'</span>';
        return $buf;
    }

    public function getTitle(): string {
        return $this->title;
    }

    /**
     * Meta data shown next to the entry: size and upper-cased file extension.
     *
     * @param ?string $hl_matched accepted for interface compatibility, not used here
     * @return array empty for folders; otherwise 'inline' => false plus the 'items' list
     */
    public function getMeta(?string $hl_matched = null): array {
        if ($this->isFolder())
            return [];
        $items = [
            sizeString($this->size),
            strtoupper($this->getExtension())
        ];
        return [
            'inline' => false,
            'items' => $items
        ];
    }

    /** Extension of the file name part of $path, as reported by extension(). */
    protected function getExtension(): string {
        return extension(basename($this->path));
    }

    public function isAvailable(): bool {
        return true;
    }

    /** Files and external entries open in a new tab. */
    public function isTargetBlank(): bool {
        return $this->isFile() || $this->external;
    }

    /**
     * @return ?string "(year)" when a year is set, null otherwise
     */
    public function getSubtitle(): ?string {
        if (!$this->year)
            return null;
        return '('.$this->year.')';
    }
}
/**
 * Loads a folder of the wff_collection table by id.
 *
 * @param int $folder_id id of the row to load
 * @param bool $with_parents when true, return the folder followed by its chain of
 *                           parent folders (nearest parent first) instead of one item
 * @return WFFCollectionItem|WFFCollectionItem[]|null null when the row does not exist
 *                                                    or is not a folder
 */
function wff_get_folder(int $folder_id, bool $with_parents = false): WFFCollectionItem|array|null {
    $db = DB();
    $res = $db->query("SELECT * FROM wff_collection WHERE id=?", $folder_id);
    if (!$db->numRows($res))
        return null;

    $folder = new WFFCollectionItem($db->fetch($res));
    if (!$folder->isFolder())
        return null;

    if (!$with_parents)
        return $folder;

    // Walk up recursively; a missing or non-folder parent simply ends the chain.
    $chain = [$folder];
    $ancestors = $folder->parentId
        ? wff_get_folder($folder->parentId, true)
        : null;

    return $ancestors === null
        ? $chain
        : array_merge($chain, $ancestors);
}
/**
 * Fetches wff_collection rows ordered by title, optionally limited to the
 * children of one or several parent folders.
 *
 * @param int|int[]|null $parent_id a parent id, a list of parent ids, or null for no filter
 * @return WFFCollectionItem[] empty when an empty id list is given
 */
function wff_get(int|array|null $parent_id = null): array {
    $where = [];
    $args = [];
    if (is_int($parent_id)) {
        $where[] = "parent_id=?";
        $args[] = $parent_id;
    } else if (is_array($parent_id)) {
        // An empty list would render as "IN ()", which is not valid SQL,
        // and no row could match it anyway.
        if (empty($parent_id))
            return [];
        // The list is interpolated rather than bound, so force every element to an integer.
        $where[] = "parent_id IN (".implode(", ", array_map('intval', $parent_id)).")";
    }

    $db = DB();
    $sql = "SELECT * FROM wff_collection";
    if (!empty($where))
        $sql .= " WHERE ".implode(" AND ", $where);
    $sql .= " ORDER BY title";
    $q = $db->query($sql, ...$args);
    return array_map('WFFCollectionItem::create_instance', $db->fetchAll($q));
}
/**
 * Fetches wff_collection rows by primary key.
 * The rows come back in whatever order the database returns them, not
 * necessarily in the order of $ids.
 *
 * @param int[] $ids
 * @return WFFCollectionItem[] empty when $ids is empty
 */
function wff_get_by_id(array $ids): array {
    // "IN ()" is not valid SQL; an empty id list can only mean an empty result.
    if (empty($ids))
        return [];

    // The list is interpolated rather than bound, so force every element to an integer.
    $id_list = implode(',', array_map('intval', $ids));

    $db = DB();
    $q = $db->query("SELECT * FROM wff_collection WHERE id IN (".$id_list.")");
    return array_map('WFFCollectionItem::create_instance', $db->fetchAll($q));
}
/**
 * Full-text search over the William Friedman collection through Sphinx.
 *
 * Matches are ranked by relevance (document_id weighted above title), with
 * folders ahead of files on equal relevance, and the matching rows are then
 * loaded from the database in that same order.
 *
 * NOTE(review): the default $count of 0 is passed straight to setLimits();
 * confirm the Sphinx client accepts a zero limit.
 *
 * @param string $q raw user query, filtered through sphinx_mkquery()
 * @param int $offset offset of the first match to return
 * @param int $count maximum number of matches to return
 * @return array ['count' => total number of matches, 'items' => WFFCollectionItem[]]
 */
function wff_search(string $q, int $offset = 0, int $count = 0): array {
    $query_filtered = sphinx_mkquery($q);

    $cl = sphinx_client();
    $cl->setLimits($offset, $count);
    $cl->setMatchMode(Sphinx\SphinxClient::SPH_MATCH_EXTENDED);
    $cl->setFieldWeights([
        'title' => 50,
        'document_id' => 60,
    ]);
    $cl->setRankingMode(Sphinx\SphinxClient::SPH_RANK_PROXIMITY_BM25);
    $cl->setSortMode(Sphinx\SphinxClient::SPH_SORT_EXTENDED, '@relevance DESC, is_folder DESC');

    // run search
    $result = $cl->query((string)$query_filtered, WFF_ARCHIVE_SPHINX_RTINDEX);

    $error = $cl->getLastError();
    $warning = $cl->getLastWarning();
    if ($error)
        logError(__FUNCTION__, $error);
    if ($warning)
        logWarning(__FUNCTION__, $warning);

    if ($result === false)
        return ['count' => 0, 'items' => []];

    $total_found = (int)$result['total_found'];

    $items = [];
    if (!empty($result['matches'])) {
        // The match keys are the document ids in rank order. The database lookup
        // gives no ordering guarantee, so index the rows by id and emit them
        // in the order Sphinx ranked them.
        $ranked_ids = array_keys($result['matches']);
        $by_id = [];
        foreach (wff_get_by_id($ranked_ids) as $item)
            $by_id[$item->id] = $item;
        foreach ($ranked_ids as $id) {
            if (isset($by_id[$id]))
                $items[] = $by_id[$id];
        }
    }

    return ['count' => $total_found, 'items' => $items];
}
/**
 * Rebuilds the William Friedman Sphinx index from scratch: truncates it, then
 * inserts one document per wff_collection row. Files are indexed together with
 * the contents of their extracted text file; folders get an empty text.
 */
function wff_reindex(): void {
    sphinx_execute("TRUNCATE RTINDEX ".WFF_ARCHIVE_SPHINX_RTINDEX);

    $db = DB();
    $q = $db->query("SELECT * FROM wff_collection");
    while ($row = $db->fetch($q)) {
        $item = new WFFCollectionItem($row);

        $txt = '';
        if ($item->isFile()) {
            // The text file carries the PDF's base name with ".pdf" replaced by ".txt".
            $txt_path = '/home/user/nsa/txt/'.str_replace('.pdf', '.txt', basename($item->path));
            $contents = is_readable($txt_path) ? file_get_contents($txt_path) : false;
            if ($contents === false) {
                // Do not index boolean false as the text; keep the row searchable
                // by title and document id, and report the missing text.
                logError(__FUNCTION__, "failed to read text file {$txt_path}");
            } else {
                $txt = $contents;
            }
        }

        sphinx_execute("INSERT INTO ".WFF_ARCHIVE_SPHINX_RTINDEX." (id, document_id, title, text, is_folder, parent_id) VALUES (?, ?, ?, ?, ?, ?)",
            $item->id, $item->getDocumentId(), $item->title, $txt, (int)$item->isFolder(), $item->parentId);
    }
}
/**
 * Text excerpts around the first keyword match for Mercure de France issues
 * (table mdf_texts, keyed by mdf_id). See _get_text_excerpts() for the result shape.
 */
function mdf_get_text_excerpts(array $ids, array $keywords, int $before = 50, int $after = 40): array {
    return _get_text_excerpts(
        table: 'mdf_texts',
        field_id: 'mdf_id',
        ids: $ids,
        keywords: $keywords,
        before: $before,
        after: $after
    );
}
/**
 * Text excerpts around the first keyword match for William Friedman documents
 * (table wff_texts, keyed by wff_id). See _get_text_excerpts() for the result shape.
 */
function wff_get_text_excerpts(array $ids, array $keywords, int $before = 50, int $after = 40): array {
    return _get_text_excerpts(
        table: 'wff_texts',
        field_id: 'wff_id',
        ids: $ids,
        keywords: $keywords,
        before: $before,
        after: $after
    );
}
/**
 * Cuts an excerpt out of each stored text around the earliest occurrence of
 * any of the given keywords.
 *
 * $table and $field_id are interpolated into the SQL as-is and must only ever
 * come from trusted constants, never from user input.
 *
 * @param string $table table holding the texts (must have a `text` column)
 * @param string $field_id name of the id column in $table
 * @param int[] $ids ids of the texts to cut excerpts from
 * @param string[] $keywords Must already be lower-cased
 * @param int $before number of characters to include before the match
 * @param int $after number of characters requested after the match
 * @return array map of id => ['excerpt' => string, 'index' => int] for texts that
 *               were found, id => null for requested ids without a row
 */
function _get_text_excerpts(string $table, string $field_id, array $ids, array $keywords, int $before, int $after): array {
    $results = [];
    foreach ($ids as $id)
        $results[$id] = null;

    // Nothing to look up, or nothing to look for. Both cases would otherwise
    // produce broken SQL (an empty IN () list, or no LOCATE() term at all).
    if (empty($ids) || empty($keywords))
        return $results;

    $db = DB();

    // One position expression per keyword; a keyword that does not occur is
    // mapped past the end of the text so that it never wins the LEAST() below.
    $positions = [];
    foreach ($keywords as $keyword) {
        $locate = "LOCATE('".$db->escape($keyword)."', text)";
        $positions[] = "IF({$locate} > 0, {$locate}, CHAR_LENGTH(text) + 1)";
    }

    // LEAST() needs at least two arguments, so a single keyword is used directly.
    $combined_parts = count($positions) > 1
        ? 'LEAST('.implode(', ', $positions).')'
        : $positions[0];

    // The id list is interpolated rather than bound, so force every element to an integer.
    $id_list = implode(',', array_map('intval', $ids));

    $total = $before + $after;

    // NOTE(review): the length expression makes the excerpt end $before + $after
    // characters past the match position, not $after; confirm this is intended.
    $sql = "SELECT
        {$field_id} AS id,
        GREATEST(
            1,
            {$combined_parts} - {$before}
        ) AS excerpt_start_index,
        SUBSTRING(
            text,
            GREATEST(
                1,
                {$combined_parts} - {$before}
            ),
            LEAST(
                CHAR_LENGTH(text),
                {$total} + {$combined_parts} - GREATEST(1, {$combined_parts} - {$before})
            )
        ) AS excerpt
        FROM
            {$table}
        WHERE
            {$field_id} IN (".$id_list.")";

    $q = $db->query($sql);
    while ($row = $db->fetch($q)) {
        $results[$row['id']] = [
            // Collapse every run of whitespace into a single space.
            'excerpt' => preg_replace('/\s+/', ' ', $row['excerpt']),
            'index' => (int)$row['excerpt_start_index']
        ];
    }
    return $results;
}
/**
 * Loads the whole Mercure de France collection, ordered by date.
 *
 * @return MDFCollectionItem[]
 */
function mdf_get(): array {
    $db = DB();
    $rows = $db->fetchAll(
        $db->query("SELECT * FROM mdf_collection ORDER BY `date`")
    );
    return array_map(MDFCollectionItem::create_instance(...), $rows);
}
/**
 * Fetches mdf_collection rows by primary key.
 * The rows come back in whatever order the database returns them, not
 * necessarily in the order of $ids.
 *
 * @param int[] $ids
 * @return MDFCollectionItem[] empty when $ids is empty
 */
function mdf_get_by_id(array $ids): array {
    // "IN ()" is not valid SQL; an empty id list can only mean an empty result.
    if (empty($ids))
        return [];

    // The list is interpolated rather than bound, so force every element to an integer.
    $id_list = implode(',', array_map('intval', $ids));

    $db = DB();
    $q = $db->query("SELECT * FROM mdf_collection WHERE id IN (".$id_list.")");
    return array_map('MDFCollectionItem::create_instance', $db->fetchAll($q));
}
/**
 * Full-text search over the Mercure de France collection through Sphinx.
 *
 * Matches are sorted by relevance (date weighted above issue, issue above
 * text), and the matching rows are then loaded from the database in that same order.
 *
 * NOTE(review): the default $count of 0 is passed straight to setLimits();
 * confirm the Sphinx client accepts a zero limit.
 *
 * @param string $q raw user query, filtered through sphinx_mkquery()
 * @param int $offset offset of the first match to return
 * @param int $count maximum number of matches to return
 * @return array ['count' => total number of matches, 'items' => MDFCollectionItem[]]
 */
function mdf_search(string $q, int $offset = 0, int $count = 0): array {
    $query_filtered = sphinx_mkquery($q);

    $cl = sphinx_client();
    $cl->setLimits($offset, $count);
    $cl->setMatchMode(Sphinx\SphinxClient::SPH_MATCH_EXTENDED);
    $cl->setFieldWeights([
        'date' => 10,
        'issue' => 9,
        'text' => 8
    ]);
    $cl->setRankingMode(Sphinx\SphinxClient::SPH_RANK_PROXIMITY_BM25);
    $cl->setSortMode(Sphinx\SphinxClient::SPH_SORT_RELEVANCE);

    // run search
    $result = $cl->query((string)$query_filtered, MDF_ARCHIVE_SPHINX_RTINDEX);

    $error = $cl->getLastError();
    $warning = $cl->getLastWarning();
    if ($error)
        logError(__FUNCTION__, $error);
    if ($warning)
        logWarning(__FUNCTION__, $warning);

    if ($result === false)
        return ['count' => 0, 'items' => []];

    $total_found = (int)$result['total_found'];

    $items = [];
    if (!empty($result['matches'])) {
        // The match keys are the document ids in rank order. The database lookup
        // gives no ordering guarantee, so index the rows by id and emit them
        // in the order Sphinx ranked them.
        $ranked_ids = array_keys($result['matches']);
        $by_id = [];
        foreach (mdf_get_by_id($ranked_ids) as $item)
            $by_id[$item->id] = $item;
        foreach ($ranked_ids as $id) {
            if (isset($by_id[$id]))
                $items[] = $by_id[$id];
        }
    }

    return ['count' => $total_found, 'items' => $items];
}
/**
 * Rebuilds the Mercure de France Sphinx index from scratch: truncates it, then
 * inserts one document per issue together with its text from mdf_texts.
 */
function mdf_reindex(): void {
    sphinx_execute("TRUNCATE RTINDEX ".MDF_ARCHIVE_SPHINX_RTINDEX);

    $db = DB();
    $insert_sql = "INSERT INTO ".MDF_ARCHIVE_SPHINX_RTINDEX." (id, volume, issue, date, text) VALUES (?, ?, ?, ?, ?)";

    foreach (mdf_get() as $issue) {
        $text_query = $db->query("SELECT text FROM mdf_texts WHERE mdf_id=?", $issue->id);
        $text = $db->result($text_query);

        sphinx_execute(
            $insert_sql,
            $issue->id,
            $issue->volume,
            (string)$issue->issue,
            $issue->getHumanFriendlyDate(),
            $text
        );
    }
}
/**
 * Lists the books entries that sit directly under a parent folder in a category.
 *
 * Ordering: the BOOKS category sorts by type, then (inside a folder only) by year,
 * then by author and title; any other category sorts by type and title.
 *
 * @param int $parent_id id of the parent folder, 0 for the top level
 * @param BookCategory $category category to list
 * @return BookItem[]
 */
function books_get(int $parent_id = 0,
                   BookCategory $category = BookCategory::BOOKS): array {
    $db = DB();

    $order_by = match (true) {
        $category != BookCategory::BOOKS => "type, title",
        $parent_id != 0 => "type, year, author, title",
        default => "type, author, title",
    };

    $rows = $db->fetchAll(
        $db->query("SELECT * FROM books WHERE category=? AND parent_id=? ORDER BY $order_by",
            $category->value, $parent_id)
    );
    return array_map(BookItem::create_instance(...), $rows);
}
/**
 * Loads a books folder by id.
 *
 * @param int $id id of the row to load
 * @return ?BookItem null when the row does not exist or is not a folder
 */
function books_get_folder(int $id): ?BookItem {
    $db = DB();
    $res = $db->query("SELECT * FROM books WHERE id=?", $id);
    if (!$db->numRows($res))
        return null;

    $book = new BookItem($db->fetch($res));
    return $book->isFolder() ? $book : null;
}
/**
 * Converts a positive number to Roman numerals by repeatedly taking off the
 * largest numeral value that still fits. Values below 1 give an empty string.
 *
 * @param int $number
 * @return string
 */
function _arabic_to_roman($number) {
    // Numeral values in descending order, subtractive forms (CM, XC, IV, ...) included.
    static $numerals = [
        [1000, 'M'], [900, 'CM'], [500, 'D'], [400, 'CD'],
        [100, 'C'],  [90, 'XC'],  [50, 'L'],  [40, 'XL'],
        [10, 'X'],   [9, 'IX'],   [5, 'V'],   [4, 'IV'],
        [1, 'I'],
    ];

    $roman = '';
    foreach ($numerals as [$value, $symbol]) {
        for (; $number >= $value; $number -= $value)
            $roman .= $symbol;
    }
    return $roman;
}