00001 <?php
00002
00015
00016
00017
00018 import('search.SearchFileParser');
00019 import('search.SearchHTMLParser');
00020 import('search.SearchHelperParser');
00021
// Stopword list read by ArticleSearchIndex::loadStopwords(); this relative path is handed to file() unchanged.
00022 define('SEARCH_STOPWORDS_FILE', 'registry/stopwords.txt');
00023
00024
// Length limit passed to String::substr() when ArticleSearchIndex::filterKeywords() truncates a keyword.
00025 define('SEARCH_KEYWORD_MAX_LENGTH', 40);
00026
/**
 * Builds and maintains the article search index through ArticleSearchDAO.
 *
 * Text is split into normalised keywords (see filterKeywords()) and each
 * keyword is stored, with its position, against a "search object" that
 * represents one indexed field or file of an article.
 *
 * Every method is written to be invoked statically, e.g.
 * ArticleSearchIndex::indexArticleMetadata($article).
 */
class ArticleSearchIndex {

	/**
	 * Store the keywords found in a block of text for a search object.
	 * @param $objectId int search object ID, as returned by ArticleSearchDAO::insertObject()
	 * @param $text string|array text to index (arrays are accepted, see filterKeywords())
	 * @param $position int position of the next keyword; passed by reference and
	 *   incremented once for every keyword the DAO reports as stored
	 */
	function indexObjectKeywords($objectId, $text, &$position) {
		$searchDao = &DAORegistry::getDAO('ArticleSearchDAO');
		$keywords = &ArticleSearchIndex::filterKeywords($text);
		foreach ($keywords as $keyword) {
			// Only advance the position when the DAO did not return null for the insert.
			if ($searchDao->insertObjectKeyword($objectId, $keyword, $position) !== null) {
				$position += 1;
			}
		}
	}

	/**
	 * Register a search object for a piece of article text and index its keywords.
	 * @param $articleId int
	 * @param $type int one of the ARTICLE_SEARCH_* type constants
	 * @param $text string|array text to index
	 * @param $assocId int optional ID associated with the object (e.g. a file ID)
	 */
	function updateTextIndex($articleId, $type, $text, $assocId = null) {
		$searchDao = &DAORegistry::getDAO('ArticleSearchDAO');
		$objectId = $searchDao->insertObject($articleId, $type, $assocId);
		$position = 0;
		ArticleSearchIndex::indexObjectKeywords($objectId, $text, $position);
	}

	/**
	 * Index the contents of an article file.
	 * Nothing is indexed when the file cannot be found, when no parser is
	 * available for it, or when the parser fails to open.
	 * @param $articleId int
	 * @param $type int one of the ARTICLE_SEARCH_* type constants
	 * @param $fileId int
	 */
	function updateFileIndex($articleId, $type, $fileId) {
		import('file.ArticleFileManager');
		// Plain "new": assigning the result of new by reference is a parse error on PHP 7+.
		$fileMgr = new ArticleFileManager($articleId);
		$file = &$fileMgr->getFile($fileId);
		if (!isset($file)) {
			return;
		}

		$parser = &SearchFileParser::fromFile($file);
		if (!isset($parser) || !$parser->open()) {
			return;
		}

		$searchDao = &DAORegistry::getDAO('ArticleSearchDAO');
		$objectId = $searchDao->insertObject($articleId, $type, $fileId);

		// The position keeps counting across chunks so keyword positions
		// are continuous over the whole file.
		$position = 0;
		while (($text = $parser->read()) !== false) {
			ArticleSearchIndex::indexObjectKeywords($objectId, $text, $position);
		}
		$parser->close();
	}

	/**
	 * Delete indexed keywords for an article.
	 * @param $articleId int
	 * @param $type int optional ARTICLE_SEARCH_* type, forwarded to the DAO
	 * @param $assocId int optional associated ID, forwarded to the DAO
	 * @return mixed whatever ArticleSearchDAO::deleteArticleKeywords() returns
	 */
	function deleteTextIndex($articleId, $type = null, $assocId = null) {
		$searchDao = &DAORegistry::getDAO('ArticleSearchDAO');
		return $searchDao->deleteArticleKeywords($articleId, $type, $assocId);
	}

	/**
	 * Split text into the list of keywords to be indexed or searched for.
	 * Words are lower-cased; stopwords, purely numeric words and words shorter
	 * than the configured [search] min_word_length are dropped; the remaining
	 * words are truncated to SEARCH_KEYWORD_MAX_LENGTH.
	 * @param $text string|array text to split; array elements are joined with newlines
	 * @param $allowWildcards boolean if true "*" is turned into "%", otherwise into a space
	 * @return array list of keywords, in order of appearance
	 */
	function &filterKeywords($text, $allowWildcards = false) {
		$minLength = Config::getVar('search', 'min_word_length');
		$stopwords = &ArticleSearchIndex::loadStopwords();

		// Flatten arrays (e.g. several field values) into one string.
		if (is_array($text)) $text = join("\n", $text);

		$cleanText = Core::cleanVar($text);

		// Strip punctuation outright, turn separator characters into spaces,
		// then handle the wildcard character and normalise case.
		$cleanText = String::regexp_replace('/[!"\#\$%\'\(\)\.\?@\[\]\^`\{\}~]/', '', $cleanText);
		$cleanText = String::regexp_replace('/[\+,:;&\/<=>\|\\\]/', ' ', $cleanText);
		$cleanText = String::regexp_replace('/[\*]/', $allowWildcards ? '%' : ' ', $cleanText);
		$cleanText = String::strtolower($cleanText);

		// Split into words on any run of whitespace.
		$words = String::regexp_split('/\s+/', $cleanText);

		// Keep only the words worth indexing. The stopword map contains the
		// empty string, so empty split results are discarded here as well.
		$keywords = array();
		foreach ($words as $k) {
			if (!isset($stopwords[$k]) && String::strlen($k) >= $minLength && !is_numeric($k)) {
				$keywords[] = String::substr($k, 0, SEARCH_KEYWORD_MAX_LENGTH);
			}
		}
		return $keywords;
	}

	/**
	 * Load the stopword list from SEARCH_STOPWORDS_FILE.
	 * The file is read once per request and cached in a static variable.
	 * Blank lines and lines whose first non-blank character is "#" are ignored.
	 * @return array map of stopword => number of occurrences in the file;
	 *   always contains the empty string as a key
	 */
	function &loadStopwords() {
		static $searchStopwords;

		if (!isset($searchStopwords)) {
			$searchStopwords = array();

			// file() returns false when the list cannot be read; treat that
			// as an empty list instead of passing false on to array functions.
			$lines = file(SEARCH_STOPWORDS_FILE);
			if (is_array($lines)) {
				foreach ($lines as $line) {
					// Trim explicitly: each line still carries its line ending,
					// and an untrimmed key would never match a keyword.
					$word = trim($line);

					// empty() rejects '' and '0'; a leading '#' marks a comment line.
					if (empty($word) || $word[0] == '#') {
						continue;
					}

					if (isset($searchStopwords[$word])) {
						$searchStopwords[$word]++;
					} else {
						$searchStopwords[$word] = 1;
					}
				}
			}

			// Make the empty string a stopword so empty words are never indexed.
			$searchStopwords[''] = 1;
		}

		return $searchStopwords;
	}

	/**
	 * Index the metadata of an article: author details, title, abstract,
	 * discipline, subject, type and coverage.
	 * Getters are called with a null argument and their results are indexed
	 * as arrays of values (presumably one value per locale - confirm against Article).
	 * @param $article Article
	 */
	function indexArticleMetadata(&$article) {
		// Collect names, affiliation and tag-stripped biographies of all authors.
		$authorText = array();
		$authors = $article->getAuthors();
		for ($i=0, $count=count($authors); $i < $count; $i++) {
			$author = &$authors[$i];
			array_push($authorText, $author->getFirstName());
			array_push($authorText, $author->getMiddleName());
			array_push($authorText, $author->getLastName());
			array_push($authorText, $author->getAffiliation());
			$bios = $author->getBiography(null);
			if (is_array($bios)) foreach ($bios as $bio) {
				array_push($authorText, strip_tags($bio));
			}
		}

		// One search object per metadata field.
		$articleId = $article->getArticleId();
		ArticleSearchIndex::updateTextIndex($articleId, ARTICLE_SEARCH_AUTHOR, $authorText);
		ArticleSearchIndex::updateTextIndex($articleId, ARTICLE_SEARCH_TITLE, $article->getTitle(null));
		ArticleSearchIndex::updateTextIndex($articleId, ARTICLE_SEARCH_ABSTRACT, $article->getAbstract(null));

		ArticleSearchIndex::updateTextIndex($articleId, ARTICLE_SEARCH_DISCIPLINE, (array) $article->getDiscipline(null));
		ArticleSearchIndex::updateTextIndex($articleId, ARTICLE_SEARCH_SUBJECT, array_merge(array_values((array) $article->getSubjectClass(null)), array_values((array) $article->getSubject(null))));
		ArticleSearchIndex::updateTextIndex($articleId, ARTICLE_SEARCH_TYPE, $article->getType(null));
		ArticleSearchIndex::updateTextIndex($articleId, ARTICLE_SEARCH_COVERAGE, array_merge(array_values((array) $article->getCoverageGeo(null)), array_values((array) $article->getCoverageChron(null)), array_values((array) $article->getCoverageSample(null))));
	}

	/**
	 * Index the metadata of a supplementary file (title, creator, subject,
	 * type, description and source) as a single search object associated
	 * with the file's ID.
	 * @param $suppFile SuppFile
	 */
	function indexSuppFileMetadata(&$suppFile) {
		$articleId = $suppFile->getArticleId();
		ArticleSearchIndex::updateTextIndex(
			$articleId,
			ARTICLE_SEARCH_SUPPLEMENTARY_FILE,
			array_merge(
				array_values((array) $suppFile->getTitle(null)),
				array_values((array) $suppFile->getCreator(null)),
				array_values((array) $suppFile->getSubject(null)),
				array_values((array) $suppFile->getTypeOther(null)),
				array_values((array) $suppFile->getDescription(null)),
				array_values((array) $suppFile->getSource(null))
			),
			$suppFile->getFileId()
		);
	}

	/**
	 * Index all files of an article: supplementary files (contents and
	 * metadata) and galley files (contents).
	 * @param $article Article
	 */
	function indexArticleFiles(&$article) {
		// Supplementary files: contents are indexed only when a file ID is
		// set, the metadata is indexed in every case.
		$fileDao = &DAORegistry::getDAO('SuppFileDAO');
		$files = &$fileDao->getSuppFilesByArticle($article->getArticleId());
		foreach ($files as $file) {
			if ($file->getFileId()) {
				ArticleSearchIndex::updateFileIndex($article->getArticleId(), ARTICLE_SEARCH_SUPPLEMENTARY_FILE, $file->getFileId());
			}
			ArticleSearchIndex::indexSuppFileMetadata($file);
		}
		// Break the reference before $files is bound to the galley list.
		unset($files);

		// Galley files.
		$fileDao = &DAORegistry::getDAO('ArticleGalleyDAO');
		$files = &$fileDao->getGalleysByArticle($article->getArticleId());
		foreach ($files as $file) {
			if ($file->getFileId()) {
				ArticleSearchIndex::updateFileIndex($article->getArticleId(), ARTICLE_SEARCH_GALLEY_FILE, $file->getFileId());
			}
		}
	}

	/**
	 * Rebuild the search index for all journals.
	 * Empties the three search tables, flushes the DAO's query cache, then
	 * re-indexes metadata and files of every article that has a submission date.
	 * @param $log boolean if true, progress messages are echoed
	 */
	function rebuildIndex($log = false) {
		// Clear the existing index.
		if ($log) echo 'Clearing index ... ';
		$searchDao = &DAORegistry::getDAO('ArticleSearchDAO');

		$searchDao->update('DELETE FROM article_search_object_keywords');
		$searchDao->update('DELETE FROM article_search_objects');
		$searchDao->update('DELETE FROM article_search_keyword_list');
		$searchDao->setCacheDir(Config::getVar('files', 'files_dir') . '/_db');
		$searchDao->_dataSource->CacheFlush();
		if ($log) echo "done\n";

		// Rebuild the index journal by journal.
		$journalDao = &DAORegistry::getDAO('JournalDAO');
		$articleDao = &DAORegistry::getDAO('ArticleDAO');

		$journals = &$journalDao->getJournals();
		while (!$journals->eof()) {
			$journal = &$journals->next();
			$numIndexed = 0;

			if ($log) echo "Indexing \"", $journal->getJournalTitle(), "\" ... ";

			$articles = &$articleDao->getArticlesByJournalId($journal->getJournalId());
			while (!$articles->eof()) {
				$article = &$articles->next();
				// Articles without a submission date are skipped.
				if ($article->getDateSubmitted()) {
					ArticleSearchIndex::indexArticleMetadata($article);
					ArticleSearchIndex::indexArticleFiles($article);
					$numIndexed++;
				}
				// Break the reference so the next assignment cannot overwrite this article.
				unset($article);
			}

			if ($log) echo $numIndexed, " articles indexed\n";
			unset($journal);
		}
	}

}
00276
00277 ?>