00001 <?php
00002
00017 import('lib.pkp.classes.search.SearchFileParser');
00018 import('lib.pkp.classes.search.SearchHTMLParser');
00019 import('lib.pkp.classes.search.SearchHelperParser');
00020
// Path of the stopword list read by MonographSearchIndex::loadStopwords().
// NOTE(review): the path is relative, so it presumably resolves against the
// application base directory / include path -- confirm.
00021 define('SEARCH_STOPWORDS_FILE', 'lib/pkp/registry/stopwords.txt');
00022
00023
// Maximum number of characters kept per keyword; MonographSearchIndex::filterKeywords()
// truncates longer words to this length.
00024 define('SEARCH_KEYWORD_MAX_LENGTH', 40);
00025
/**
 * Builds and maintains the keyword search index for monographs.
 *
 * All methods are invoked statically (MonographSearchIndex::method()), which
 * is also how they call one another inside this class. Index storage is
 * delegated to MonographSearchDAO.
 */
class MonographSearchIndex {

	/**
	 * Extract the keywords from a block of text and store them for a search object.
	 * @param $objectId int ID of the search object the keywords belong to
	 * @param $text string|array text to index (an array is joined by filterKeywords())
	 * @param $position int running keyword position, passed by reference and
	 *  advanced by one for every keyword the DAO reports as stored
	 */
	function indexObjectKeywords($objectId, $text, &$position) {
		$searchDao =& DAORegistry::getDAO('MonographSearchDAO');
		$keywords =& MonographSearchIndex::filterKeywords($text);
		foreach ($keywords as $keyword) {
			// A null result means nothing was stored, so the position is not consumed.
			if ($searchDao->insertObjectKeyword($objectId, $keyword, $position) !== null) {
				$position += 1;
			}
		}
	}

	/**
	 * Register a search object for a piece of monograph text and index its keywords.
	 * @param $monographId int
	 * @param $type int one of the MONOGRAPH_SEARCH_* type constants
	 * @param $text string|array text to index
	 * @param $assocId int optional ID of an associated object
	 */
	function updateTextIndex($monographId, $type, $text, $assocId = null) {
		$searchDao =& DAORegistry::getDAO('MonographSearchDAO');
		$objectId = $searchDao->insertObject($monographId, $type, $assocId);

		// Keyword positions start at zero for every search object.
		$position = 0;
		MonographSearchIndex::indexObjectKeywords($objectId, $text, $position);
	}

	/**
	 * Index the contents of the latest revision of a file.
	 * Nothing is indexed when the file cannot be found, no parser is
	 * available for it, or the parser fails to open it.
	 * @param $monographId int
	 * @param $type int one of the MONOGRAPH_SEARCH_* type constants
	 * @param $fileId int
	 */
	function updateFileIndex($monographId, $type, $fileId) {
		$submissionFileDao =& DAORegistry::getDAO('SubmissionFileDAO');
		$file =& $submissionFileDao->getLatestRevision($fileId);
		if (!isset($file)) return;

		$parser =& SearchFileParser::fromFile($file);
		if (!isset($parser) || !$parser->open()) return;

		// The search object is only created once the file is known to be readable.
		$searchDao =& DAORegistry::getDAO('MonographSearchDAO');
		$objectId = $searchDao->insertObject($monographId, $type, $fileId);

		// The position counter is shared across chunks so that keyword
		// positions keep increasing over the whole file.
		$position = 0;
		while (($text = $parser->read()) !== false) {
			MonographSearchIndex::indexObjectKeywords($objectId, $text, $position);
		}
		$parser->close();
	}

	/**
	 * Delete indexed keywords for a monograph.
	 * @param $monographId int
	 * @param $type int optional MONOGRAPH_SEARCH_* type, passed through to the DAO
	 * @param $assocId int optional associated object ID, passed through to the DAO
	 * @return mixed whatever MonographSearchDAO::deleteMonographKeywords() returns
	 */
	function deleteTextIndex($monographId, $type = null, $assocId = null) {
		$searchDao =& DAORegistry::getDAO('MonographSearchDAO');
		return $searchDao->deleteMonographKeywords($monographId, $type, $assocId);
	}

	/**
	 * Split text into an ordered list of indexable keywords.
	 * Words are lowercased; stopwords, words shorter than the configured
	 * [search] min_word_length and purely numeric words are dropped; the
	 * remaining words are truncated to SEARCH_KEYWORD_MAX_LENGTH characters.
	 * @param $text string|array text to split (array elements are joined with newlines)
	 * @param $allowWildcards boolean when true "*" is converted to "%" instead of a space
	 * @return array list of keywords, in the order they occur in the text
	 */
	function &filterKeywords($text, $allowWildcards = false) {
		$minLength = Config::getVar('search', 'min_word_length');
		$stopwords =& MonographSearchIndex::loadStopwords();

		// Flatten multi-valued (e.g. localized) text into a single string.
		if (is_array($text)) $text = join("\n", $text);

		$cleanText = Core::cleanVar($text);

		// Punctuation in the first class is removed outright; characters in the
		// second class act as word separators and become spaces.
		$cleanText = String::regexp_replace('/[!"\#\$%\'\(\)\.\?@\[\]\^`\{\}~]/', '', $cleanText);
		$cleanText = String::regexp_replace('/[\+,:;&\/<=>\|\\\]/', ' ', $cleanText);
		$cleanText = String::regexp_replace('/[\*]/', $allowWildcards ? '%' : ' ', $cleanText);
		$cleanText = String::strtolower($cleanText);

		// Split on any run of whitespace.
		$words = String::regexp_split('/\s+/', $cleanText);

		$keywords = array();
		foreach ($words as $word) {
			if (isset($stopwords[$word])) continue;
			if (String::strlen($word) < $minLength) continue;
			if (is_numeric($word)) continue;
			$keywords[] = String::substr($word, 0, SEARCH_KEYWORD_MAX_LENGTH);
		}
		return $keywords;
	}

	/**
	 * Load the stopword list from SEARCH_STOPWORDS_FILE.
	 * The list is read once per request and cached in a static variable.
	 * @return array map of stopword => number of occurrences in the file;
	 *  the empty string is always present as a key
	 */
	function &loadStopwords() {
		static $searchStopwords;

		if (!isset($searchStopwords)) {
			$searchStopwords = array();

			// Trim and filter in an explicit loop. The previous implementation
			// used create_function() (removed in PHP 8) and relied on a
			// by-reference array_filter callback to trim the lines in place.
			$lines = file(SEARCH_STOPWORDS_FILE);
			if (is_array($lines)) foreach ($lines as $line) {
				$word = trim($line);
				// Skip blank lines, a bare "0" (as before) and "#" comment lines.
				if (empty($word) || $word[0] == '#') continue;
				if (isset($searchStopwords[$word])) {
					$searchStopwords[$word]++;
				} else {
					$searchStopwords[$word] = 1;
				}
			}

			// The empty string is always treated as a stopword.
			$searchStopwords[''] = 1;
		}

		return $searchStopwords;
	}

	/**
	 * Index the metadata of a monograph: authors (names, affiliations and
	 * biographies), title, abstract, discipline, subject, type and coverage.
	 * @param $monograph Monograph
	 */
	function indexMonographMetadata(&$monograph) {
		$monographId = $monograph->getId();

		// Collect all author-related text into one list.
		$authorText = array();
		$authorDao =& DAORegistry::getDAO('AuthorDAO');
		$authors = $authorDao->getAuthorsBySubmissionId($monographId);
		foreach ($authors as $author) {
			array_push($authorText, $author->getFirstName());
			array_push($authorText, $author->getMiddleName());
			array_push($authorText, $author->getLastName());

			// Affiliations and biographies are fetched with a null argument and
			// iterated as arrays (presumably one entry per locale -- confirm);
			// markup is stripped before indexing.
			$affiliations = $author->getAffiliation(null);
			if (is_array($affiliations)) foreach ($affiliations as $affiliation) {
				array_push($authorText, strip_tags($affiliation));
			}
			$bios = $author->getBiography(null);
			if (is_array($bios)) foreach ($bios as $bio) {
				array_push($authorText, strip_tags($bio));
			}
		}

		// Provides the MONOGRAPH_SEARCH_* constants used below.
		import('classes.search.MonographSearch');

		MonographSearchIndex::updateTextIndex($monographId, MONOGRAPH_SEARCH_AUTHOR, $authorText);
		MonographSearchIndex::updateTextIndex($monographId, MONOGRAPH_SEARCH_TITLE, $monograph->getTitle(null));
		MonographSearchIndex::updateTextIndex($monographId, MONOGRAPH_SEARCH_ABSTRACT, $monograph->getAbstract(null));
		MonographSearchIndex::updateTextIndex($monographId, MONOGRAPH_SEARCH_DISCIPLINE, (array) $monograph->getDiscipline(null));

		$subjectText = array_merge(
			array_values((array) $monograph->getSubjectClass(null)),
			array_values((array) $monograph->getSubject(null))
		);
		MonographSearchIndex::updateTextIndex($monographId, MONOGRAPH_SEARCH_SUBJECT, $subjectText);

		MonographSearchIndex::updateTextIndex($monographId, MONOGRAPH_SEARCH_TYPE, $monograph->getType(null));

		$coverageText = array_merge(
			array_values((array) $monograph->getCoverageGeo(null)),
			array_values((array) $monograph->getCoverageChron(null)),
			array_values((array) $monograph->getCoverageSample(null))
		);
		MonographSearchIndex::updateTextIndex($monographId, MONOGRAPH_SEARCH_COVERAGE, $coverageText);
	}

	/**
	 * Index the viewable proof files of a monograph.
	 * @param $monograph Monograph
	 */
	function indexMonographFiles(&$monograph) {
		$submissionFileDao =& DAORegistry::getDAO('SubmissionFileDAO');
		// Provide MONOGRAPH_FILE_PROOF and MONOGRAPH_SEARCH_GALLEY_FILE.
		import('classes.monograph.MonographFile');
		import('classes.search.MonographSearch');
		$files =& $submissionFileDao->getLatestRevisions($monograph->getId(), MONOGRAPH_FILE_PROOF);

		foreach ($files as $file) {
			// Only files that have an ID and are flagged viewable are indexed.
			if ($file->getFileId() && $file->getViewable()) {
				MonographSearchIndex::updateFileIndex($monograph->getId(), MONOGRAPH_SEARCH_GALLEY_FILE, $file->getFileId());
			}
		}
	}

	/**
	 * Remove the indexed file keywords (MONOGRAPH_SEARCH_GALLEY_FILE) of a monograph.
	 * @param $monograph Monograph
	 */
	function clearMonographFiles(&$monograph) {
		// Make sure MONOGRAPH_SEARCH_GALLEY_FILE is defined, as the other
		// methods using the MONOGRAPH_SEARCH_* constants do.
		import('classes.search.MonographSearch');
		$searchDao =& DAORegistry::getDAO('MonographSearchDAO');
		$searchDao->deleteMonographKeywords($monograph->getId(), MONOGRAPH_SEARCH_GALLEY_FILE);
	}

	/**
	 * Wipe the search index and rebuild it for every press.
	 * Only monographs with a publication date are indexed.
	 * @param $log boolean when true, progress messages are echoed
	 */
	function rebuildIndex($log = false) {
		// Clear the index tables and flush the DAO's query cache.
		if ($log) echo 'Clearing index ... ';
		$searchDao =& DAORegistry::getDAO('MonographSearchDAO');

		$searchDao->update('DELETE FROM monograph_search_object_keywords');
		$searchDao->update('DELETE FROM monograph_search_objects');
		$searchDao->update('DELETE FROM monograph_search_keyword_list');
		$searchDao->setCacheDir(Config::getVar('files', 'files_dir') . '/_db');
		$searchDao->_dataSource->CacheFlush();
		if ($log) echo "done\n";

		// Re-index press by press.
		$pressDao =& DAORegistry::getDAO('PressDAO');
		$monographDao =& DAORegistry::getDAO('MonographDAO');

		$presses =& $pressDao->getPresses();
		while (!$presses->eof()) {
			$press =& $presses->next();
			$numIndexed = 0;

			if ($log) echo "Indexing \"", $press->getLocalizedName(), "\" ... ";

			$monographs =& $monographDao->getByPressId($press->getId());
			while (!$monographs->eof()) {
				$monograph =& $monographs->next();
				if ($monograph->getDatePublished()) {
					MonographSearchIndex::indexMonographMetadata($monograph);
					MonographSearchIndex::indexMonographFiles($monograph);
					$numIndexed++;
				}
				// Break the reference before the next iteration rebinds it.
				unset($monograph);
			}

			if ($log) echo $numIndexed, " monographs indexed\n";
			unset($press);
		}
	}

}
00259
00260 ?>