classes/search/PaperSearchIndex.inc.php

Go to the documentation of this file.
00001 <?php
00002 
00016 //$Id$
00017 
00018 import('search.SearchFileParser');
00019 import('search.SearchHTMLParser');
00020 import('search.SearchHelperParser');
00021 import('search.PaperSearch');
00022 
00023 // Maximum length of an indexed keyword; filterKeywords() truncates longer words to this length
00024 define('SEARCH_KEYWORD_MAX_LENGTH', 40);
00025 
/**
 * Maintains the paper search index: splits text and file contents into
 * keywords and stores them through PaperSearchDAO.
 *
 * All methods are invoked in the ClassName::method() style used throughout
 * this file and keep no instance state.
 */
class PaperSearchIndex {

   /**
    * Split a block of text into keywords and store each one for a search object.
    * @param $objectId int ID of the search object the keywords belong to
    * @param $text string|array text to index; filtered through filterKeywords()
    * @param $position int running keyword position, passed by reference and
    *  incremented once for every keyword whose insertObjectKeyword() call
    *  returns a non-null value
    */
   function indexObjectKeywords($objectId, $text, &$position) {
      $searchDao =& DAORegistry::getDAO('PaperSearchDAO');
      $keywords =& PaperSearchIndex::filterKeywords($text);
      for ($i = 0, $count = count($keywords); $i < $count; $i++) {
         if ($searchDao->insertObjectKeyword($objectId, $keywords[$i], $position) !== null) {
            $position += 1;
         }
      }
   }

   /**
    * Register a search object for a piece of paper text and index its keywords,
    * starting at position 0.
    * @param $paperId int
    * @param $type int one of the PAPER_SEARCH_* constants
    * @param $text string|array
    * @param $assocId int optional associated ID stored with the search object
    */
   function updateTextIndex($paperId, $type, $text, $assocId = null) {
      $searchDao =& DAORegistry::getDAO('PaperSearchDAO');
      $objectId = $searchDao->insertObject($paperId, $type, $assocId);
      $position = 0;
      PaperSearchIndex::indexObjectKeywords($objectId, $text, $position);
   }

   /**
    * Index the contents of a paper file.
    * Nothing is indexed when the file cannot be retrieved, when no parser is
    * returned for it, or when the parser cannot be opened.
    * @param $paperId int
    * @param $type int one of the PAPER_SEARCH_* constants
    * @param $fileId int also stored as the search object's associated ID
    */
   function updateFileIndex($paperId, $type, $fileId) {
      import('file.PaperFileManager');
      $fileMgr = new PaperFileManager($paperId);
      $file =& $fileMgr->getFile($fileId);
      if (!isset($file)) {
         return;
      }

      $parser =& SearchFileParser::fromFile($file);
      if (!isset($parser) || !$parser->open()) {
         return;
      }

      $searchDao =& DAORegistry::getDAO('PaperSearchDAO');
      $objectId = $searchDao->insertObject($paperId, $type, $fileId);

      // Keyword positions run continuously across all chunks read from the file
      $position = 0;
      while (($text = $parser->read()) !== false) {
         PaperSearchIndex::indexObjectKeywords($objectId, $text, $position);
      }
      $parser->close();
   }

   /**
    * Delete indexed keywords for a paper.
    * @param $paperId int
    * @param $type int optional, passed through to the DAO
    * @param $assocId int optional, passed through to the DAO
    * @return mixed whatever PaperSearchDAO::deletePaperKeywords() returns
    */
   function deleteTextIndex($paperId, $type = null, $assocId = null) {
      $searchDao =& DAORegistry::getDAO('PaperSearchDAO');
      return $searchDao->deletePaperKeywords($paperId, $type, $assocId);
   }

   /**
    * Split text into a list of lower-cased keywords.
    * Punctuation is stripped or turned into whitespace; stopwords, words
    * shorter than the configured minimum length and numeric words are
    * dropped; the remaining words are truncated to SEARCH_KEYWORD_MAX_LENGTH.
    * @param $text string|array an array is joined with newlines first
    * @param $allowWildcards boolean when true "*" becomes "%", otherwise a space
    * @return array list of keywords, in order of appearance (duplicates kept)
    */
   function &filterKeywords($text, $allowWildcards = false) {
      $minLength = Config::getVar('search', 'min_word_length');
      $stopwords =& PaperSearchIndex::loadStopwords();

      // Join multiple lines into a single string
      if (is_array($text)) $text = join("\n", $text);

      $cleanText = Core::cleanVar($text);

      // Remove punctuation: the first set is deleted outright, the second
      // set acts as a word separator
      $cleanText = String::regexp_replace('/[!"\#\$%\'\(\)\.\?@\[\]\^`\{\}~]/', '', $cleanText);
      $cleanText = String::regexp_replace('/[\+,:;&\/<=>\|\\\]/', ' ', $cleanText);
      $cleanText = String::regexp_replace('/[\*]/', $allowWildcards ? '%' : ' ', $cleanText);
      $cleanText = String::strtolower($cleanText);

      // Split into words
      $words = preg_split('/\s+/', $cleanText);

      // FIXME Do not perform further filtering for some fields, e.g., author names?

      // Remove stopwords, too-short words and plain numbers
      $keywords = array();
      foreach ($words as $k) {
         if (!isset($stopwords[$k]) && String::strlen($k) >= $minLength && !is_numeric($k)) {
            $keywords[] = String::substr($k, 0, SEARCH_KEYWORD_MAX_LENGTH);
         }
      }
      return $keywords;
   }

   /**
    * Load the stopword list from "stopwords.txt" in the configured registry
    * directory. The list is read once per request and kept in a static
    * variable (FIXME Cache?).
    * Lines are trimmed; empty lines and lines starting with "#" are skipped.
    * @return array map of stopword => number of occurrences in the file;
    *  always contains the empty string as a key
    */
   function &loadStopwords() {
      static $searchStopwords;

      if (!isset($searchStopwords)) {
         $searchStopwords = array();

         // file() returns false when the list cannot be read; in that case
         // only the empty-string entry below is registered.
         $lines = file(Config::getVar('general', 'registry_dir') . '/stopwords.txt');
         if (is_array($lines)) {
            foreach ($lines as $line) {
               // Trim explicitly: file() keeps the trailing newline, and an
               // untrimmed entry would never match a keyword.
               $word = trim($line);
               if (empty($word) || $word[0] == '#') continue;

               if (isset($searchStopwords[$word])) {
                  $searchStopwords[$word]++;
               } else {
                  $searchStopwords[$word] = 1;
               }
            }
         }

         // Treat the empty string as a stopword so empty tokens are discarded
         $searchStopwords[''] = 1;
      }

      return $searchStopwords;
   }

   /**
    * Index the metadata of a paper: author details, title, abstract,
    * discipline, subject, type and coverage.
    * @param $paper object must provide getId(), getAuthors() and the
    *  metadata getters called below
    */
   function indexPaperMetadata(&$paper) {
      // Build author keywords
      $authorText = array();
      $authors = $paper->getAuthors();
      for ($i = 0, $count = count($authors); $i < $count; $i++) {
         $author =& $authors[$i];
         array_push($authorText, $author->getFirstName());
         array_push($authorText, $author->getMiddleName());
         array_push($authorText, $author->getLastName());
         array_push($authorText, $author->getAffiliation());
         $bios = $author->getBiography(null);
         if (is_array($bios)) foreach ($bios as $bio) { // Localized
            array_push($authorText, strip_tags($bio));
         }
      }

      // Update search index
      $paperId = $paper->getId();
      PaperSearchIndex::updateTextIndex($paperId, PAPER_SEARCH_AUTHOR, $authorText);
      PaperSearchIndex::updateTextIndex($paperId, PAPER_SEARCH_TITLE, $paper->getTitle(null));
      PaperSearchIndex::updateTextIndex($paperId, PAPER_SEARCH_ABSTRACT, $paper->getAbstract(null));
      PaperSearchIndex::updateTextIndex($paperId, PAPER_SEARCH_DISCIPLINE, $paper->getDiscipline(null));
      PaperSearchIndex::updateTextIndex(
         $paperId,
         PAPER_SEARCH_SUBJECT,
         array_merge(
            array_values((array) $paper->getSubjectClass(null)),
            array_values((array) $paper->getSubject(null))
         )
      );
      PaperSearchIndex::updateTextIndex($paperId, PAPER_SEARCH_TYPE, $paper->getType(null));
      PaperSearchIndex::updateTextIndex(
         $paperId,
         PAPER_SEARCH_COVERAGE,
         array_merge(
            array_values((array) $paper->getCoverageGeo(null)),
            array_values((array) $paper->getCoverageChron(null)),
            array_values((array) $paper->getCoverageSample(null))
         )
      );
      // FIXME Index sponsors too?
   }

   /**
    * Index the metadata of a supplementary file (title, creator, subject,
    * type, description and source) under the file's ID.
    * @param $suppFile object must provide getPaperId(), getFileId() and the
    *  metadata getters called below
    */
   function indexSuppFileMetadata(&$suppFile) {
      // Update search index
      $paperId = $suppFile->getPaperId();
      PaperSearchIndex::updateTextIndex(
         $paperId,
         PAPER_SEARCH_SUPPLEMENTARY_FILE,
         array_merge(
            array_values((array) $suppFile->getTitle(null)),
            array_values((array) $suppFile->getCreator(null)),
            array_values((array) $suppFile->getSubject(null)),
            array_values((array) $suppFile->getTypeOther(null)),
            array_values((array) $suppFile->getDescription(null)),
            array_values((array) $suppFile->getSource(null))
         ),
         $suppFile->getFileId()
      );
   }

   /**
    * Index all files attached to a paper: the contents and metadata of its
    * supplementary files, and the contents of its galley files.
    * File contents are only indexed for entries that have a file ID.
    * @param $paper object must provide getId()
    */
   function indexPaperFiles(&$paper) {
      // Index supplementary files
      $fileDao =& DAORegistry::getDAO('SuppFileDAO');
      $files =& $fileDao->getSuppFilesByPaper($paper->getId());
      foreach ($files as $file) {
         if ($file->getFileId()) {
            PaperSearchIndex::updateFileIndex($paper->getId(), PAPER_SEARCH_SUPPLEMENTARY_FILE, $file->getFileId());
         }
         PaperSearchIndex::indexSuppFileMetadata($file);
      }
      // Break the reference before $files is re-bound to the galley list
      unset($files);

      // Index galley files
      $fileDao =& DAORegistry::getDAO('PaperGalleyDAO');
      $files =& $fileDao->getGalleysByPaper($paper->getId());
      foreach ($files as $file) {
         if ($file->getFileId()) {
            PaperSearchIndex::updateFileIndex($paper->getId(), PAPER_SEARCH_GALLEY_FILE, $file->getFileId());
         }
      }
   }

   /**
    * Rebuild the whole search index: empty the three search tables, flush
    * the DAO's data source cache, then re-index the metadata and files of
    * every paper that has a submission date, one scheduled conference at a
    * time.
    * @param $log boolean when true, progress messages are echoed
    */
   function rebuildIndex($log = false) {
      // Clear index
      if ($log) echo 'Clearing index ... ';
      $searchDao =& DAORegistry::getDAO('PaperSearchDAO');
      // FIXME Abstract into PaperSearchDAO?
      $searchDao->update('DELETE FROM paper_search_object_keywords');
      $searchDao->update('DELETE FROM paper_search_objects');
      $searchDao->update('DELETE FROM paper_search_keyword_list');
      $searchDao->setCacheDir(Config::getVar('files', 'files_dir') . '/_db');
      $searchDao->_dataSource->CacheFlush();
      if ($log) echo "done\n";

      // Build index
      $schedConfDao =& DAORegistry::getDAO('SchedConfDAO');
      $paperDao =& DAORegistry::getDAO('PaperDAO');

      $schedConfs =& $schedConfDao->getSchedConfs();
      while (!$schedConfs->eof()) {
         $schedConf =& $schedConfs->next();
         $numIndexed = 0;

         if ($log) echo "Indexing \"", $schedConf->getFullTitle(), "\" ... ";

         $papers =& $paperDao->getPapersBySchedConfId($schedConf->getId());
         while (!$papers->eof()) {
            $paper =& $papers->next();
            // Only papers with a submission date are indexed
            if ($paper->getDateSubmitted()) {
               PaperSearchIndex::indexPaperMetadata($paper);
               PaperSearchIndex::indexPaperFiles($paper);
               $numIndexed++;
            }
            unset($paper);
         }

         if ($log) echo $numIndexed, " papers indexed\n";
         unset($schedConf);
      }
   }

}
00285 
00286 ?>

Generated on 25 Jul 2013 for Open Conference Systems by  doxygen 1.4.7