• Main Page
  • Modules
  • Classes
  • Files
  • File List

classes/search/MonographSearchIndex.inc.php

00001 <?php
00002 
00017 import('lib.pkp.classes.search.SearchFileParser');
00018 import('lib.pkp.classes.search.SearchHTMLParser');
00019 import('lib.pkp.classes.search.SearchHelperParser');
00020 
// Relative path of the stopword list that MonographSearchIndex::loadStopwords() reads with file()
00021 define('SEARCH_STOPWORDS_FILE', 'lib/pkp/registry/stopwords.txt');
00022 
00023 // Words are truncated to at most this length
00024 define('SEARCH_KEYWORD_MAX_LENGTH', 40);
00025 
/**
 * Maintains the monograph search index: extracts keywords from monograph
 * metadata and files and stores them through MonographSearchDAO.
 *
 * All methods are invoked statically within this class
 * (MonographSearchIndex::method()).
 */
class MonographSearchIndex {

   /**
    * Index the keywords found in a block of text for a search object.
    * @param $objectId int ID of the search object the keywords belong to
    * @param $text string|array text to index (arrays are joined by filterKeywords())
    * @param $position int running keyword position, passed by reference; it is
    *  incremented once for every keyword whose insertion does not return null
    */
   function indexObjectKeywords($objectId, $text, &$position) {
      $searchDao =& DAORegistry::getDAO('MonographSearchDAO');
      $keywords =& MonographSearchIndex::filterKeywords($text);
      foreach ($keywords as $keyword) {
         // Only advance the position when the DAO reports an inserted keyword.
         if ($searchDao->insertObjectKeyword($objectId, $keyword, $position) !== null) {
            $position += 1;
         }
      }
   }

   /**
    * Add a block of text to the search index.
    * @param $monographId int
    * @param $type int search object type, passed through to MonographSearchDAO::insertObject()
    * @param $text string|array text to index
    * @param $assocId int optional associated ID, passed through to MonographSearchDAO::insertObject()
    */
   function updateTextIndex($monographId, $type, $text, $assocId = null) {
      $searchDao =& DAORegistry::getDAO('MonographSearchDAO');
      $objectId = $searchDao->insertObject($monographId, $type, $assocId);
      $position = 0;
      MonographSearchIndex::indexObjectKeywords($objectId, $text, $position);
   }

   /**
    * Add the text content of a file to the search index.
    * Nothing is indexed when the file's latest revision cannot be found, when
    * no parser is available for it, or when the parser cannot be opened.
    * @param $monographId int
    * @param $type int search object type
    * @param $fileId int file ID; also stored as the search object's assoc ID
    */
   function updateFileIndex($monographId, $type, $fileId) {
      $submissionFileDao =& DAORegistry::getDAO('SubmissionFileDAO'); /* @var $submissionFileDao SubmissionFileDAO */
      $file =& $submissionFileDao->getLatestRevision($fileId);
      if (!isset($file)) return;

      $parser =& SearchFileParser::fromFile($file);
      if (!isset($parser)) return;

      if (!$parser->open()) {
         // cannot open parser; unsupported format?
         return;
      }

      $searchDao =& DAORegistry::getDAO('MonographSearchDAO');
      $objectId = $searchDao->insertObject($monographId, $type, $fileId);

      // Keyword positions run continuously across all chunks read from the file.
      $position = 0;
      while (($text = $parser->read()) !== false) {
         MonographSearchIndex::indexObjectKeywords($objectId, $text, $position);
      }
      $parser->close();
   }

   /**
    * Delete keywords from the search index.
    * @param $monographId int
    * @param $type int optional search object type
    * @param $assocId int optional associated ID
    * @return mixed the result of MonographSearchDAO::deleteMonographKeywords()
    */
   function deleteTextIndex($monographId, $type = null, $assocId = null) {
      $searchDao =& DAORegistry::getDAO('MonographSearchDAO');
      return $searchDao->deleteMonographKeywords($monographId, $type, $assocId);
   }

   /**
    * Split text into lower-cased search keywords.
    * Punctuation is removed or turned into whitespace; stopwords, numeric
    * words and words shorter than the configured "min_word_length" are
    * dropped; remaining words are truncated to SEARCH_KEYWORD_MAX_LENGTH.
    * @param $text string|array text to split (array elements are joined with newlines)
    * @param $allowWildcards boolean when true "*" becomes "%", otherwise it becomes a space
    * @return array list of keywords, in order of appearance (duplicates kept)
    */
   function &filterKeywords($text, $allowWildcards = false) {
      $minLength = Config::getVar('search', 'min_word_length');
      $stopwords =& MonographSearchIndex::loadStopwords();

      // Join multiple lines into a single string
      if (is_array($text)) $text = join("\n", $text);

      $cleanText = Core::cleanVar($text);

      // Remove punctuation
      $cleanText = String::regexp_replace('/[!"\#\$%\'\(\)\.\?@\[\]\^`\{\}~]/', '', $cleanText);
      $cleanText = String::regexp_replace('/[\+,:;&\/<=>\|\\\]/', ' ', $cleanText);
      $cleanText = String::regexp_replace('/[\*]/', $allowWildcards ? '%' : ' ', $cleanText);
      $cleanText = String::strtolower($cleanText);

      // Split into words
      $words = String::regexp_split('/\s+/', $cleanText);

      // FIXME Do not perform further filtering for some fields, e.g., author names?

      // Remove stopwords
      $keywords = array();
      foreach ($words as $k) {
         if (!isset($stopwords[$k]) && String::strlen($k) >= $minLength && !is_numeric($k)) {
            $keywords[] = String::substr($k, 0, SEARCH_KEYWORD_MAX_LENGTH);
         }
      }
      return $keywords;
   }

   /**
    * Load the stopword list from SEARCH_STOPWORDS_FILE.
    * The list is read once and kept in a static variable for later calls.
    * Lines are trimmed; empty lines and lines starting with "#" are skipped.
    * @return array map of stopword => number of occurrences in the file; the
    *  empty string is always present so that empty words are filtered out
    */
   function &loadStopwords() {
      static $searchStopwords;

      if (!isset($searchStopwords)) {
         // Load stopwords only once per request (FIXME Cache?)
         $searchStopwords = array();

         // file() returns false when the list cannot be read; in that case
         // only the empty-string entry below is registered.
         $lines = file(SEARCH_STOPWORDS_FILE);
         if (is_array($lines)) {
            foreach ($lines as $line) {
               // Trim explicitly: the keys must not keep the trailing newline,
               // otherwise isset($stopwords[$word]) lookups never match.
               $word = trim($line);
               if (empty($word) || $word[0] == '#') continue;

               if (isset($searchStopwords[$word])) {
                  $searchStopwords[$word] += 1;
               } else {
                  $searchStopwords[$word] = 1;
               }
            }
         }
         $searchStopwords[''] = 1;
      }

      return $searchStopwords;
   }

   /**
    * Index the metadata of a monograph: author data (names, affiliations,
    * biographies), title, abstract, discipline, subject, type and coverage.
    * @param $monograph object monograph to index
    */
   function indexMonographMetadata(&$monograph) {
      // Build author keywords
      $authorText = array();
      $authorDao =& DAORegistry::getDAO('AuthorDAO');
      $authors = $authorDao->getAuthorsBySubmissionId($monograph->getId());
      foreach ($authors as $author) {
         $authorText[] = $author->getFirstName();
         $authorText[] = $author->getMiddleName();
         $authorText[] = $author->getLastName();
         $affiliations = $author->getAffiliation(null);
         if (is_array($affiliations)) foreach ($affiliations as $affiliation) { // Localized
            $authorText[] = strip_tags($affiliation);
         }
         $bios = $author->getBiography(null);
         if (is_array($bios)) foreach ($bios as $bio) { // Localized
            $authorText[] = strip_tags($bio);
         }
      }

      // Update search index
      import('classes.search.MonographSearch');
      $monographId = $monograph->getId();
      MonographSearchIndex::updateTextIndex($monographId, MONOGRAPH_SEARCH_AUTHOR, $authorText);
      MonographSearchIndex::updateTextIndex($monographId, MONOGRAPH_SEARCH_TITLE, $monograph->getTitle(null));
      MonographSearchIndex::updateTextIndex($monographId, MONOGRAPH_SEARCH_ABSTRACT, $monograph->getAbstract(null));

      MonographSearchIndex::updateTextIndex($monographId, MONOGRAPH_SEARCH_DISCIPLINE, (array) $monograph->getDiscipline(null));
      MonographSearchIndex::updateTextIndex($monographId, MONOGRAPH_SEARCH_SUBJECT, array_merge(array_values((array) $monograph->getSubjectClass(null)), array_values((array) $monograph->getSubject(null))));
      MonographSearchIndex::updateTextIndex($monographId, MONOGRAPH_SEARCH_TYPE, $monograph->getType(null));
      MonographSearchIndex::updateTextIndex($monographId, MONOGRAPH_SEARCH_COVERAGE, array_merge(array_values((array) $monograph->getCoverageGeo(null)), array_values((array) $monograph->getCoverageChron(null)), array_values((array) $monograph->getCoverageSample(null))));
      // FIXME Index sponsors too?
   }

   /**
    * Index the proof-stage files of a monograph that have a file ID and are
    * flagged as viewable.
    * @param $monograph object monograph whose files are indexed
    */
   function indexMonographFiles(&$monograph) {
      // Index galley files
      $submissionFileDao =& DAORegistry::getDAO('SubmissionFileDAO'); /* @var $submissionFileDao SubmissionFileDAO */
      import('classes.monograph.MonographFile'); // Constants
      import('classes.search.MonographSearch'); // Constants
      $files =& $submissionFileDao->getLatestRevisions($monograph->getId(), MONOGRAPH_FILE_PROOF);

      foreach ($files as $file) {
         if ($file->getFileId() && $file->getViewable()) {
            MonographSearchIndex::updateFileIndex($monograph->getId(), MONOGRAPH_SEARCH_GALLEY_FILE, $file->getFileId());
         }
      }
   }

   /**
    * Remove the galley file keywords of a monograph from the search index.
    * @param $monograph object monograph whose file index entries are removed
    */
   function clearMonographFiles(&$monograph) {
      // Load the MONOGRAPH_SEARCH_* constants, as the other indexing methods do;
      // this method may be the first entry point used in a request.
      import('classes.search.MonographSearch'); // Constants
      $searchDao =& DAORegistry::getDAO('MonographSearchDAO');
      $searchDao->deleteMonographKeywords($monograph->getId(), MONOGRAPH_SEARCH_GALLEY_FILE);
   }

   /**
    * Rebuild the search index for all presses: empty the three search
    * tables, flush the DAO's data source cache, then re-index the metadata
    * and files of every monograph that has a publication date.
    * @param $log boolean when true, progress messages are echoed
    */
   function rebuildIndex($log = false) {
      // Clear index
      if ($log) echo 'Clearing index ... ';
      $searchDao =& DAORegistry::getDAO('MonographSearchDAO');
      // FIXME Abstract into MonographSearchDAO?
      $searchDao->update('DELETE FROM monograph_search_object_keywords');
      $searchDao->update('DELETE FROM monograph_search_objects');
      $searchDao->update('DELETE FROM monograph_search_keyword_list');
      $searchDao->setCacheDir(Config::getVar('files', 'files_dir') . '/_db');
      $searchDao->_dataSource->CacheFlush();
      if ($log) echo "done\n";

      // Build index
      $pressDao =& DAORegistry::getDAO('PressDAO');
      $monographDao =& DAORegistry::getDAO('MonographDAO');

      $presses =& $pressDao->getPresses();
      while (!$presses->eof()) {
         $press =& $presses->next();
         $numIndexed = 0;

         if ($log) echo "Indexing \"", $press->getLocalizedName(), "\" ... ";

         $monographs =& $monographDao->getByPressId($press->getId());
         while (!$monographs->eof()) {
            $monograph =& $monographs->next();
            // Only monographs with a publication date are indexed.
            if ($monograph->getDatePublished()) {
               MonographSearchIndex::indexMonographMetadata($monograph);
               MonographSearchIndex::indexMonographFiles($monograph);
               $numIndexed++;
            }
            // Break the reference before the next iteration rebinds it.
            unset($monograph);
         }

         if ($log) echo $numIndexed, " monographs indexed\n";
         unset($press);
      }
   }

}
00259 
00260 ?>

Generated on Mon Sep 17 2012 13:58:55 for Open Monograph Press by doxygen 1.7.1