00001 <?php
00002
00016
00017
00018 import('search.SearchFileParser');
00019 import('search.SearchHTMLParser');
00020 import('search.SearchHelperParser');
00021 import('search.PaperSearch');
00022
00023
00024 define('SEARCH_KEYWORD_MAX_LENGTH', 40); // Indexed keywords are truncated to this many characters (see PaperSearchIndex::filterKeywords()).
00025
/**
 * Maintains the keyword search index for papers.
 *
 * Provides helpers to index free text and files attached to a paper, to
 * remove index entries, and to rebuild the whole index from scratch.
 * Within this class the methods are invoked statically
 * (PaperSearchIndex::method()).
 */
class PaperSearchIndex {

	/**
	 * Index a block of text for a search object.
	 * @param $objectId int ID of the search object the keywords belong to
	 * @param $text string|array text to index (arrays are accepted, see filterKeywords())
	 * @param $position int passed by reference; incremented by one for every
	 *  keyword whose insertion does not return null
	 */
	function indexObjectKeywords($objectId, $text, &$position) {
		$searchDao =& DAORegistry::getDAO('PaperSearchDAO');
		$keywords =& PaperSearchIndex::filterKeywords($text);
		foreach ($keywords as $keyword) {
			// Only advance the position for keywords the DAO actually stored
			if ($searchDao->insertObjectKeyword($objectId, $keyword, $position) !== null) {
				$position += 1;
			}
		}
	}

	/**
	 * Add a block of text to the search index.
	 * @param $paperId int
	 * @param $type int one of the PAPER_SEARCH_* type constants
	 * @param $text string|array
	 * @param $assocId int optional ID of an associated record
	 */
	function updateTextIndex($paperId, $type, $text, $assocId = null) {
		$searchDao =& DAORegistry::getDAO('PaperSearchDAO');
		$objectId = $searchDao->insertObject($paperId, $type, $assocId);
		$position = 0;
		PaperSearchIndex::indexObjectKeywords($objectId, $text, $position);
	}

	/**
	 * Add the contents of a file to the search index.
	 * Nothing is indexed when the file cannot be retrieved, when no parser
	 * is available for it, or when the parser cannot be opened.
	 * @param $paperId int
	 * @param $type int one of the PAPER_SEARCH_* type constants
	 * @param $fileId int
	 */
	function updateFileIndex($paperId, $type, $fileId) {
		import('file.PaperFileManager');
		$fileMgr = new PaperFileManager($paperId);
		$file =& $fileMgr->getFile($fileId);

		if (isset($file)) {
			$parser =& SearchFileParser::fromFile($file);
		}

		if (isset($parser)) {
			if ($parser->open()) {
				$searchDao =& DAORegistry::getDAO('PaperSearchDAO');
				$objectId = $searchDao->insertObject($paperId, $type, $fileId);

				// The position counter is shared across all chunks read from the file
				$position = 0;
				while (($text = $parser->read()) !== false) {
					PaperSearchIndex::indexObjectKeywords($objectId, $text, $position);
				}
				$parser->close();
			}
		}
	}

	/**
	 * Delete keywords from the search index.
	 * @param $paperId int
	 * @param $type int optional PAPER_SEARCH_* type constant
	 * @param $assocId int optional ID of an associated record
	 * @return mixed whatever PaperSearchDAO::deletePaperKeywords() returns
	 */
	function deleteTextIndex($paperId, $type = null, $assocId = null) {
		$searchDao =& DAORegistry::getDAO('PaperSearchDAO');
		return $searchDao->deletePaperKeywords($paperId, $type, $assocId);
	}

	/**
	 * Split a string into a clean array of keywords.
	 * Punctuation is removed or turned into whitespace, the text is
	 * lower-cased and split on whitespace; stopwords, words shorter than the
	 * configured minimum length and numeric words are dropped, and each
	 * remaining word is truncated to SEARCH_KEYWORD_MAX_LENGTH characters.
	 * @param $text string|array arrays are joined with newlines first
	 * @param $allowWildcards boolean if true, '*' becomes '%'; otherwise it
	 *  is replaced by a space
	 * @return array of keywords
	 */
	function &filterKeywords($text, $allowWildcards = false) {
		$minLength = Config::getVar('search', 'min_word_length');
		$stopwords =& PaperSearchIndex::loadStopwords();

		// Join multiple lines into a single string
		if (is_array($text)) $text = join("\n", $text);

		$cleanText = Core::cleanVar($text);

		// Remove punctuation: the first set is deleted outright, the second
		// set acts as a word separator
		$cleanText = String::regexp_replace('/[!"\#\$%\'\(\)\.\?@\[\]\^`\{\}~]/', '', $cleanText);
		$cleanText = String::regexp_replace('/[\+,:;&\/<=>\|\\\]/', ' ', $cleanText);
		$cleanText = String::regexp_replace('/[\*]/', $allowWildcards ? '%' : ' ', $cleanText);
		$cleanText = String::strtolower($cleanText);

		// Split into words
		$words = preg_split('/\s+/', $cleanText);

		// FIXME Do not perform further filtering for some fields, e.g., author names?

		// Remove stopwords, too-short words and plain numbers
		$keywords = array();
		foreach ($words as $k) {
			if (!isset($stopwords[$k]) && String::strlen($k) >= $minLength && !is_numeric($k)) {
				$keywords[] = String::substr($k, 0, SEARCH_KEYWORD_MAX_LENGTH);
			}
		}
		return $keywords;
	}

	/**
	 * Return the list of stopwords, loaded once per request from
	 * stopwords.txt in the configured registry directory.
	 * Lines are trimmed; blank lines and lines starting with '#' are
	 * ignored. The empty string is always included as a stopword.
	 * @return array stopword => number of occurrences in the file
	 */
	function &loadStopwords() {
		static $searchStopwords;

		if (!isset($searchStopwords)) {
			// Load stopwords only once per request (FIXME Cache?)
			$searchStopwords = array();
			$lines = file(Config::getVar('general', 'registry_dir') . '/stopwords.txt');

			// file() returns false when the list cannot be read; in that
			// case only the empty-string stopword below is registered.
			if (is_array($lines)) {
				foreach ($lines as $line) {
					// Trim explicitly instead of relying on a by-reference
					// array_filter() callback built with create_function().
					$word = trim($line);
					if (!$word || $word[0] == '#') continue;

					if (isset($searchStopwords[$word])) {
						$searchStopwords[$word]++;
					} else {
						$searchStopwords[$word] = 1;
					}
				}
			}
			$searchStopwords[''] = 1;
		}

		return $searchStopwords;
	}

	/**
	 * Index all metadata of a paper: authors, title, abstract, discipline,
	 * subject, type and coverage.
	 * @param $paper object paper providing the getters used below
	 */
	function indexPaperMetadata(&$paper) {
		// Build author keywords
		$authorText = array();
		$authors = $paper->getAuthors();
		for ($i=0, $count=count($authors); $i < $count; $i++) {
			$author =& $authors[$i];
			array_push($authorText, $author->getFirstName());
			array_push($authorText, $author->getMiddleName());
			array_push($authorText, $author->getLastName());
			array_push($authorText, $author->getAffiliation());
			$bios = $author->getBiography(null);
			if (is_array($bios)) foreach ($bios as $bio) { // Localized
				array_push($authorText, strip_tags($bio));
			}
		}

		// Update search index
		$paperId = $paper->getId();
		PaperSearchIndex::updateTextIndex($paperId, PAPER_SEARCH_AUTHOR, $authorText);
		PaperSearchIndex::updateTextIndex($paperId, PAPER_SEARCH_TITLE, $paper->getTitle(null));
		PaperSearchIndex::updateTextIndex($paperId, PAPER_SEARCH_ABSTRACT, $paper->getAbstract(null));
		PaperSearchIndex::updateTextIndex($paperId, PAPER_SEARCH_DISCIPLINE, $paper->getDiscipline(null));
		PaperSearchIndex::updateTextIndex(
			$paperId,
			PAPER_SEARCH_SUBJECT,
			array_merge(
				array_values((array) $paper->getSubjectClass(null)),
				array_values((array) $paper->getSubject(null))
			)
		);
		PaperSearchIndex::updateTextIndex($paperId, PAPER_SEARCH_TYPE, $paper->getType(null));
		PaperSearchIndex::updateTextIndex(
			$paperId,
			PAPER_SEARCH_COVERAGE,
			array_merge(
				array_values((array) $paper->getCoverageGeo(null)),
				array_values((array) $paper->getCoverageChron(null)),
				array_values((array) $paper->getCoverageSample(null))
			)
		);
		// FIXME Index sponsors too?
	}

	/**
	 * Index the metadata of a supplementary file (title, creator, subject,
	 * type, description and source), associated with the file's ID.
	 * @param $suppFile object supplementary file providing the getters used below
	 */
	function indexSuppFileMetadata(&$suppFile) {
		// Update search index
		$paperId = $suppFile->getPaperId();
		PaperSearchIndex::updateTextIndex(
			$paperId,
			PAPER_SEARCH_SUPPLEMENTARY_FILE,
			array_merge(
				array_values((array) $suppFile->getTitle(null)),
				array_values((array) $suppFile->getCreator(null)),
				array_values((array) $suppFile->getSubject(null)),
				array_values((array) $suppFile->getTypeOther(null)),
				array_values((array) $suppFile->getDescription(null)),
				array_values((array) $suppFile->getSource(null))
			),
			$suppFile->getFileId()
		);
	}

	/**
	 * Index all files attached to a paper: supplementary files (contents and
	 * metadata) and galley files (contents only).
	 * @param $paper object paper providing getId()
	 */
	function indexPaperFiles(&$paper) {
		// Index supplementary files
		$fileDao =& DAORegistry::getDAO('SuppFileDAO');
		$files =& $fileDao->getSuppFilesByPaper($paper->getId());
		foreach ($files as $file) {
			// File contents are indexed only when a file ID is set;
			// the metadata is indexed either way.
			if ($file->getFileId()) {
				PaperSearchIndex::updateFileIndex($paper->getId(), PAPER_SEARCH_SUPPLEMENTARY_FILE, $file->getFileId());
			}
			PaperSearchIndex::indexSuppFileMetadata($file);
		}
		unset($files);

		// Index galley files
		$fileDao =& DAORegistry::getDAO('PaperGalleyDAO');
		$files =& $fileDao->getGalleysByPaper($paper->getId());
		foreach ($files as $file) {
			if ($file->getFileId()) {
				PaperSearchIndex::updateFileIndex($paper->getId(), PAPER_SEARCH_GALLEY_FILE, $file->getFileId());
			}
		}
	}

	/**
	 * Rebuild the search index for all scheduled conferences.
	 * The three index tables are emptied and the DAO's query cache flushed,
	 * then every paper that has a submission date is re-indexed.
	 * @param $log boolean if true, progress messages are echoed
	 */
	function rebuildIndex($log = false) {
		// Clear index
		if ($log) echo 'Clearing index ... ';
		$searchDao =& DAORegistry::getDAO('PaperSearchDAO');
		// FIXME Abstract into PaperSearchDAO?
		$searchDao->update('DELETE FROM paper_search_object_keywords');
		$searchDao->update('DELETE FROM paper_search_objects');
		$searchDao->update('DELETE FROM paper_search_keyword_list');
		$searchDao->setCacheDir(Config::getVar('files', 'files_dir') . '/_db');
		$searchDao->_dataSource->CacheFlush();
		if ($log) echo "done\n";

		// Build index
		$schedConfDao =& DAORegistry::getDAO('SchedConfDAO');
		$paperDao =& DAORegistry::getDAO('PaperDAO');

		$schedConfs =& $schedConfDao->getSchedConfs();
		while (!$schedConfs->eof()) {
			$schedConf =& $schedConfs->next();
			$numIndexed = 0;

			if ($log) echo "Indexing \"", $schedConf->getFullTitle(), "\" ... ";

			$papers =& $paperDao->getPapersBySchedConfId($schedConf->getId());
			while (!$papers->eof()) {
				$paper =& $papers->next();
				// Papers without a submission date are skipped
				if ($paper->getDateSubmitted()) {
					PaperSearchIndex::indexPaperMetadata($paper);
					PaperSearchIndex::indexPaperFiles($paper);
					$numIndexed++;
				}
				// Break the reference before the next iteration rebinds it
				unset($paper);
			}

			if ($log) echo $numIndexed, " papers indexed\n";
			unset($schedConf);
		}
	}

}
00285
00286 ?>