Open Monograph Press  1.1
 All Classes Namespaces Functions Variables Groups Pages
PubmedNlm30CitationSchemaFilter.inc.php
1 <?php
2 
23 import('lib.pkp.plugins.metadata.nlm30.filter.Nlm30CitationSchemaFilter');
24 import('lib.pkp.classes.filter.EmailFilterSetting');
25 
// Endpoints of the NCBI E-utilities used by this filter (eSearch to find
// PMIDs, eFetch to retrieve article metadata, eLink to find full text links).
// NCBI serves the E-utilities over HTTPS only, so the secure scheme is used.
define('PUBMED_WEBSERVICE_ESEARCH', 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi');
define('PUBMED_WEBSERVICE_EFETCH', 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi');
define('PUBMED_WEBSERVICE_ELINK', 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi');
29 
/**
 * Constructor
 *
 * Sets the display name, registers the optional "email" filter setting
 * and then hands the filter group plus the publication types handled by
 * this filter (journal and conference proceedings) to the parent
 * constructor.
 *
 * @param $filterGroup mixed passed through unchanged to the parent constructor
 */
function PubmedNlm30CitationSchemaFilter($filterGroup) {
	$this->setDisplayName('PubMed');

	// Register the (optional) email setting of this filter.
	$emailFilterSetting = new EmailFilterSetting(
		'email',
		'metadata.filters.pubmed.settings.email.displayName',
		'metadata.filters.pubmed.settings.email.validationMessage',
		FORM_VALIDATOR_OPTIONAL_VALUE
	);
	$this->addSetting($emailFilterSetting);

	// Publication types this filter is registered for.
	$supportedPublicationTypes = array(
		NLM30_PUBLICATION_TYPE_JOURNAL,
		NLM30_PUBLICATION_TYPE_CONFPROC
	);
	parent::Nlm30CitationSchemaFilter($filterGroup, $supportedPublicationTypes);
}
53 
54  //
55  // Getters and Setters
56  //
/**
 * Get the value stored under the "email" setting of this filter.
 *
 * @return mixed whatever getData('email') yields
 */
function getEmail() {
	$email = $this->getData('email');
	return $email;
}
64 
65 
66  //
67  // Implement template methods from PersistableFilter
68  //
/**
 * Get the fully qualified (dot-separated) class name of this filter.
 *
 * @return string
 */
function getClassName() {
	$className = 'lib.pkp.plugins.citationLookup.pubmed.filter.PubmedNlm30CitationSchemaFilter';
	return $className;
}
75 
76 
77  //
78  // Implement template methods from Filter
79  //
/**
 * Try to identify the PMID of the given citation and, if one is found,
 * return the citation description built from the PubMed metadata.
 *
 * If the citation already carries a PMID it is looked up directly.
 * Otherwise up to three eSearch queries are issued (author list, article
 * title, and a "strict" query combining all available properties) and
 * their result lists are compared to narrow the candidates down to a
 * single PMID.
 *
 * @param $citationDescription MetadataDescription
 * @return MetadataDescription the result of _lookup() for the identified
 *  PMID, or null if no single PMID could be determined
 */
function &process(&$citationDescription) {
	$pmid = $citationDescription->getStatement('pub-id[@pub-id-type="pmid"]');

	// If the citation does not have a PMID, try to get one from eSearch
	// otherwise skip directly to eFetch.
	if (empty($pmid)) {
		// Initialize search result arrays.
		$pmidArrayFromAuthorsSearch = $pmidArrayFromTitleSearch = $pmidArrayFromStrictSearch = array();

		// 1) Try a "loose" search based on the author list.
		//    (This works surprisingly well for pubmed.)
		$authors =& $citationDescription->getStatement('person-group[@person-group-type="author"]');
		if (is_array($authors)) {
			import('lib.pkp.plugins.metadata.nlm30.filter.Nlm30NameSchemaPersonStringFilter');
			$personNameFilter = new Nlm30NameSchemaPersonStringFilter(PERSON_STRING_FILTER_MULTIPLE, '%firstname%%initials%%prefix% %surname%%suffix%', ', ');
			$authorsString = (string)$personNameFilter->execute($authors);
			if (!empty($authorsString)) {
				$pmidArrayFromAuthorsSearch =& $this->_search($authorsString);
			}
		}

		// 2) Try a "loose" search based on the article title
		$articleTitle = (string)$citationDescription->getStatement('article-title');
		if (!empty($articleTitle)) {
			$pmidArrayFromTitleSearch =& $this->_search($articleTitle);
		}

		// 3) Try a "strict" search based on as much information as possible.
		//    Maps NLM properties to the PubMed search field tag appended to
		//    the corresponding term.
		$searchProperties = array(
			'article-title' => '',
			'person-group[@person-group-type="author"]' => '[Auth]',
			'source' => '[Jour]',
			'date' => '[DP]',
			'volume' => '[VI]',
			'issue' => '[IP]',
			'fpage' => '[PG]'
		);
		$searchTerms = '';
		$statements = $citationDescription->getStatements();
		foreach($searchProperties as $nlm30Property => $pubmedProperty) {
			if (isset($statements[$nlm30Property])) {
				if (!empty($searchTerms)) $searchTerms .= ' AND ';

				// Special treatment for authors
				if ($nlm30Property == 'person-group[@person-group-type="author"]') {
					assert(isset($statements['person-group[@person-group-type="author"]'][0]));
					$firstAuthor =& $statements['person-group[@person-group-type="author"]'][0];

					// Add surname
					$searchTerms .= (string)$firstAuthor->getStatement('surname');

					// Add initial of the first given name
					$givenNames = $firstAuthor->getStatement('given-names');
					if (is_array($givenNames)) $searchTerms .= ' '.String::substr($givenNames[0], 0, 1);
				} else {
					$searchTerms .= $citationDescription->getStatement($nlm30Property);
				}

				$searchTerms .= $pubmedProperty;
			}
		}

		// Only query the web service if we actually collected search terms;
		// otherwise the strict result list simply stays empty.
		if ($searchTerms != '') {
			$pmidArrayFromStrictSearch =& $this->_search($searchTerms);
		}

		// TODO: add another search like strict, but without article title
		// e.g. ...term=Baumgart+Dc[Auth]+AND+Lancet[Jour]+AND+2005[DP]+AND+366[VI]+AND+9492[IP]+AND+1210[PG]

		// Compare the arrays to try to narrow it down to one PMID. The first
		// case that evaluates to true wins. array_intersect() preserves keys,
		// which is why current() rather than index 0 is used on its results.
		switch (true) {
			// strict search has a single result
			case (count($pmidArrayFromStrictSearch) == 1):
				$pmid = $pmidArrayFromStrictSearch[0];
				break;

			// 3-way intersection
			case (count($intersect = array_intersect($pmidArrayFromTitleSearch, $pmidArrayFromAuthorsSearch, $pmidArrayFromStrictSearch)) == 1):
				$pmid = current($intersect);
				break;

			// 2-way intersection: title / strict
			case (count($pmid_2way1 = array_intersect($pmidArrayFromTitleSearch, $pmidArrayFromStrictSearch)) == 1):
				$pmid = current($pmid_2way1);
				break;

			// 2-way intersection: authors / strict
			case (count($pmid_2way2 = array_intersect($pmidArrayFromAuthorsSearch, $pmidArrayFromStrictSearch)) == 1):
				$pmid = current($pmid_2way2);
				break;

			// 2-way intersection: authors / title
			case (count($pmid_2way3 = array_intersect($pmidArrayFromAuthorsSearch, $pmidArrayFromTitleSearch)) == 1):
				$pmid = current($pmid_2way3);
				break;

			// we only have one result for title
			case (count($pmidArrayFromTitleSearch) == 1):
				$pmid = $pmidArrayFromTitleSearch[0];
				break;

			// we only have one result for authors
			case (count($pmidArrayFromAuthorsSearch) == 1):
				$pmid = $pmidArrayFromAuthorsSearch[0];
				break;

			// we were unable to find a PMID
			default:
				$pmid = '';
		}
	}

	// If we have a PMID, get a metadata array for it
	if (!empty($pmid)) {
		$citationDescription =& $this->_lookup($pmid);
		return $citationDescription;
	}

	// Nothing found
	$nullVar = null;
	return $nullVar;
}
206 
207  //
208  // Private methods
209  //
/**
 * Query the eSearch web service with the given search terms and collect
 * the text of every <Id> element found in the response.
 *
 * @param $searchTerms string
 * @return array list of PMIDs; empty if the web service call returned
 *  null or the response contained no <Id> elements
 */
function &_search($searchTerms) {
	// Assemble the eSearch query.
	$queryParams = array(
		'db' => 'pubmed',
		'tool' => 'pkp-wal',
		'term' => $searchTerms
	);
	$email = $this->getEmail();
	if (!is_null($email)) $queryParams['email'] = $email;

	// Call the eSearch web service; a null result leaves the list empty.
	$foundPmids = array();
	$searchResultDom = $this->callWebService(PUBMED_WEBSERVICE_ESEARCH, $queryParams);
	if (!is_null($searchResultDom)) {
		// Every <Id> element of the response carries one PMID.
		$idNodes = $searchResultDom->getElementsByTagName('Id');
		foreach ($idNodes as $pmidNode) {
			$foundPmids[] = $pmidNode->textContent;
		}
	}

	return $foundPmids;
}
238 
/**
 * Retrieve the metadata PubMed holds for the given PMID.
 *
 * Calls eFetch and extracts title, source, volume, issue, authors,
 * pagination, publication date, publication type and DOI from the
 * response. Then calls eLink and, if a link without subscription,
 * membership or fee attributes is found, stores the first one as URI.
 * Elements that are missing from the response are skipped rather than
 * dereferenced.
 *
 * @param $pmid string
 * @return MetadataDescription the result of
 *  getNlm30CitationDescriptionFromMetadataArray(), or null if the eFetch
 *  call returned null
 */
function &_lookup($pmid) {
	$nullVar = null;

	// Use eFetch to get XML metadata for the given PMID
	$lookupParams = array(
		'db' => 'pubmed',
		'mode' => 'xml',
		'tool' => 'pkp-wal',
		'id' => $pmid
	);
	if (!is_null($this->getEmail())) $lookupParams['email'] = $this->getEmail();

	// Call the eFetch URL and get an XML result
	if (is_null($resultDOM = $this->callWebService(PUBMED_WEBSERVICE_EFETCH, $lookupParams))) return $nullVar;

	$metadata = array(
		'pub-id[@pub-id-type="pmid"]' => $pmid
	);

	// Simple one-to-one properties: take the text of the first matching
	// element, if there is one. (DOMNodeList::item() returns null for a
	// missing index, so the length has to be checked first.)
	$simpleProperties = array(
		'ArticleTitle' => 'article-title',
		'MedlineTA' => 'source',
		'Volume' => 'volume',
		'Issue' => 'issue'
	);
	foreach ($simpleProperties as $pubmedElement => $nlm30Property) {
		$propertyNodes = $resultDOM->getElementsByTagName($pubmedElement);
		if ($propertyNodes->length > 0) {
			$propertyFirstNode = $propertyNodes->item(0);
			$metadata[$nlm30Property] = $propertyFirstNode->textContent;
		}
	}

	// Get list of author full names
	foreach ($resultDOM->getElementsByTagName("Author") as $authorNode) {
		// Authors without a <LastName> (e.g. collective names) cannot be
		// represented as a name description here and are skipped.
		// FIXME: Collective names correspond to an NLM-citation <collab>
		// tag and should be part of the Metadata implementation.
		$lastNameNodes = $authorNode->getElementsByTagName('LastName');
		if ($lastNameNodes->length == 0) continue;

		if (!isset($metadata['person-group[@person-group-type="author"]']))
			$metadata['person-group[@person-group-type="author"]'] = array();

		// Instantiate an NLM name description
		$authorDescription = new MetadataDescription('lib.pkp.plugins.metadata.nlm30.schema.Nlm30NameSchema', ASSOC_TYPE_AUTHOR);

		// Surname
		$lastNameFirstNode = $lastNameNodes->item(0);
		$authorDescription->addStatement('surname', $lastNameFirstNode->textContent);

		// Given names: prefer <FirstName>, fall back to <ForeName>.
		$givenNamesString = '';
		$firstNameNodes = $authorNode->getElementsByTagName('FirstName');
		if ($firstNameNodes->length > 0) {
			$firstNameFirstNode = $firstNameNodes->item(0);
			$givenNamesString = $firstNameFirstNode->textContent;
		} else {
			$foreNameNodes = $authorNode->getElementsByTagName('ForeName');
			if ($foreNameNodes->length > 0) {
				$foreNameFirstNode = $foreNameNodes->item(0);
				$givenNamesString = $foreNameFirstNode->textContent;
			}
		}
		if (!empty($givenNamesString)) {
			foreach(explode(' ', $givenNamesString) as $givenName) $authorDescription->addStatement('given-names', String::trimPunctuation($givenName));
		}

		// Suffix
		$suffixNodes = $authorNode->getElementsByTagName('Suffix');
		if ($suffixNodes->length > 0) {
			$suffixFirstNode = $suffixNodes->item(0);
			$authorDescription->addStatement('suffix', $suffixFirstNode->textContent);
		}

		$metadata['person-group[@person-group-type="author"]'][] =& $authorDescription;
		// Break the reference so the next iteration does not overwrite
		// the description just added.
		unset($authorDescription);
	}

	// Extract pagination
	$medlinePgnNodes = $resultDOM->getElementsByTagName('MedlinePgn');
	if ($medlinePgnNodes->length > 0) {
		$medlinePgnFirstNode = $medlinePgnNodes->item(0);
		if (String::regexp_match_get("/^[:p\.\s]*(?P<fpage>[Ee]?\d+)(-(?P<lpage>\d+))?/", $medlinePgnFirstNode->textContent, $pages)) {
			$fPage = (int)$pages['fpage'];
			$metadata['fpage'] = $fPage;
			if (!empty($pages['lpage'])) {
				$lPage = (int)$pages['lpage'];

				// Deal with shortcuts like '382-7': replace the trailing
				// digits of the first page with those of the last page.
				if ($lPage < $fPage) {
					$lPage = (int)(String::substr($pages['fpage'], 0, -String::strlen($pages['lpage'])).$pages['lpage']);
				}

				$metadata['lpage'] = $lPage;
			}
		}
	}

	// Get publication date (can be in several places in PubMed).
	$dateNode = null;
	$articleDateNodes = $resultDOM->getElementsByTagName('ArticleDate');
	if ($articleDateNodes->length > 0) {
		$dateNode = $articleDateNodes->item(0);
	} else {
		$pubDateNodes = $resultDOM->getElementsByTagName('PubDate');
		if ($pubDateNodes->length > 0) {
			$dateNode = $pubDateNodes->item(0);
		}
	}

	// Retrieve the data parts and assemble date.
	if (!is_null($dateNode)) {
		$publicationDate = '';
		$requiresNormalization = false;
		// Stop at the first missing part so that we end up with
		// YYYY, YYYY-MM or YYYY-MM-DD.
		foreach(array('Year' => 4, 'Month' => 2, 'Day' => 2) as $dateElement => $padding) {
			$dateElementNodes = $dateNode->getElementsByTagName($dateElement);
			if ($dateElementNodes->length > 0) {
				if (!empty($publicationDate)) $publicationDate.='-';
				$dateElementFirstNode = $dateElementNodes->item(0);
				$datePart = str_pad($dateElementFirstNode->textContent, $padding, '0', STR_PAD_LEFT);
				if (!is_numeric($datePart)) $requiresNormalization = true;
				$publicationDate .= $datePart;
			} else {
				break;
			}
		}

		// Normalize the date to NLM standard if necessary.
		if ($requiresNormalization) {
			$dateFilter = new DateStringNormalizerFilter();
			$publicationDate = $dateFilter->execute($publicationDate);
		}

		if (!empty($publicationDate)) $metadata['date'] = $publicationDate;
	}

	// Get publication type
	$publicationTypeNodes = $resultDOM->getElementsByTagName('PublicationType');
	if ($publicationTypeNodes->length > 0) {
		foreach($publicationTypeNodes as $publicationType) {
			// The vast majority of items on PubMed are articles so catch these...
			if (String::strpos(String::strtolower($publicationType->textContent), 'article') !== false) {
				$metadata['[@publication-type]'] = NLM30_PUBLICATION_TYPE_JOURNAL;
				break;
			}
		}
	}

	// Get DOI if it exists
	$articleIdNodes = $resultDOM->getElementsByTagName('ArticleId');
	foreach ($articleIdNodes as $idNode) {
		if ($idNode->getAttribute('IdType') == 'doi') {
			$metadata['pub-id[@pub-id-type="doi"]'] = $idNode->textContent;
		}
	}

	// Use eLink utility to find fulltext links
	$linkParams = array(
		'dbfrom' => 'pubmed',
		'cmd' => 'llinks',
		'tool' => 'pkp-wal',
		'id' => $pmid
	);
	// Identify ourselves the same way as in the eSearch/eFetch requests.
	if (!is_null($this->getEmail())) $linkParams['email'] = $this->getEmail();

	if(!is_null($linkDOM = $this->callWebService(PUBMED_WEBSERVICE_ELINK, $linkParams))) {
		// Get a list of possible links
		$links = array();
		foreach ($linkDOM->getElementsByTagName("ObjUrl") as $linkOut) {
			$attributes = '';
			foreach ($linkOut->getElementsByTagName("Attribute") as $attribute) $attributes .= String::strtolower($attribute->textContent).' / ';

			// Only add links to open access resources
			if (String::strpos($attributes, "subscription") === false && String::strpos($attributes, "membership") === false &&
					String::strpos($attributes, "fee") === false && $attributes != "") {
				$urlNodes = $linkOut->getElementsByTagName('Url');
				if ($urlNodes->length > 0) {
					$urlFirstNode = $urlNodes->item(0);
					$links[] = $urlFirstNode->textContent;
				}
			}
		}

		// Take the first link if we have any left (presumably pubmed returns them in preferential order)
		if (isset($links[0])) $metadata['uri'] = $links[0];
	}

	return $this->getNlm30CitationDescriptionFromMetadataArray($metadata);
}
429 }
430 ?>
static strlen($string)
Definition: String.inc.php:137
static substr($string, $start, $length=false)
Definition: String.inc.php:187
static trimPunctuation($string)
Definition: String.inc.php:685
static strpos($haystack, $needle, $offset=0)
Definition: String.inc.php:154
setDisplayName($displayName)
Definition: Filter.inc.php:140
& getData($key, $locale=null)
static regexp_match_get($pattern, $subject, &$matches)
Definition: String.inc.php:351
static strtolower($string)
Definition: String.inc.php:238
Filter that converts from NLM name to a string.
Filter that normalizes a date string to YYYY[-MM[-DD]].
Abstract base class for all filters that transform NLM citation metadata descriptions.
Filter that uses the Pubmed web service to identify a PMID and corresponding meta-data for a given NL...
Class that describes a configurable filter setting which must be an email.
Class modeling a description (DCMI abstract model) or subject- predicate-object graph (RDF)...
& getNlm30CitationDescriptionFromMetadataArray(&$metadataArray)
& callWebService($url, &$params, $returnType=XSL_TRANSFORMER_DOCTYPE_DOM, $method= 'GET')