Open Monograph Press  3.3.0
XMLParser.php
1 <?php
2 
3 namespace PhpXmlRpc\Helper;
4 
7 
/**
 * Set of SAX-style callbacks that decode an xmlrpc document (method call, method response or,
 * optionally, a single bare value) into the shared state array $_xh.
 *
 * The handlers compare element names against upper-case literals ('VALUE', 'STRUCT', ...), so
 * they presumably are meant to be registered on a PHP xml parser with case folding enabled
 * (NOTE(review): registration happens outside this class - confirm against callers).
 *
 * Project classes are referenced by fully qualified name so that this class does not depend
 * on `use` statements at the top of the file.
 */
class XMLParser
{
    /**
     * Parsing state, read and written by every handler.
     *
     * Keys present from the start:
     *  - ac         accumulator for character data
     *  - stack      names of the currently open elements, used to validate nesting
     *  - valuestack work area for the arrays and structs being built
     *  - isf        0 = ok, 1 = the response is an xmlrpc fault, 2 = the document is not valid xmlrpc
     *  - isf_reason human readable explanation set together with isf = 2
     *  - method     method name of a call; false until a METHODNAME element has been closed
     *  - params     the values found inside PARAM elements
     *  - pt         type of each entry of 'params'
     *  - rt         lower-cased name of the top level element ('methodcall', 'methodresponse' or 'value')
     *
     * Keys added while parsing:
     *  - vt         type of the value being parsed ('value' means: no typed element seen yet)
     *  - lv         "looking for a value" flag: 1 inside VALUE, 3 once a typed value is complete, 0 otherwise.
     *               It implements the rule that an untyped value is a string
     *  - value      the last value decoded
     *  - php_class  php class name carried by the PHP_CLASS attribute of a STRUCT/ARRAY, if any
     */
    public $_xh = array(
        'ac' => '',
        'stack' => array(),
        'valuestack' => array(),
        'isf' => 0,
        'isf_reason' => '',
        'method' => false, // so we can check later if we got a methodname or not
        'params' => array(),
        'pt' => array(),
        'rt' => '',
    );

    /**
     * For every known xmlrpc element, the list of elements it may be a direct child of.
     * Elements used as top level (METHODCALL, METHODRESPONSE) have no entry.
     */
    public $xmlrpc_valid_parents = array(
        'VALUE' => array('MEMBER', 'DATA', 'PARAM', 'FAULT'),
        'BOOLEAN' => array('VALUE'),
        'I4' => array('VALUE'),
        'INT' => array('VALUE'),
        'STRING' => array('VALUE'),
        'DOUBLE' => array('VALUE'),
        'DATETIME.ISO8601' => array('VALUE'),
        'BASE64' => array('VALUE'),
        'MEMBER' => array('STRUCT'),
        'NAME' => array('MEMBER'),
        'DATA' => array('ARRAY'),
        'ARRAY' => array('VALUE'),
        'STRUCT' => array('VALUE'),
        'PARAM' => array('PARAMS'),
        'METHODNAME' => array('METHODCALL'),
        'PARAMS' => array('METHODCALL', 'METHODRESPONSE'),
        'FAULT' => array('METHODRESPONSE'),
        'NIL' => array('VALUE'), // only used when extension activated
        'EX:NIL' => array('VALUE'), // only used when extension activated
    );

    /**
     * Start-of-element handler.
     *
     * Validates the position of the element (top level or child of an allowed parent), prepares
     * the state needed to decode it and pushes its name on the nesting stack. On any violation
     * it sets isf = 2 plus isf_reason and returns; all handlers then ignore the rest of the document.
     *
     * @param mixed $parser the xml parser invoking the callback (unused)
     * @param string $name upper-case element name
     * @param array $attrs element attributes; only PHP_CLASS (on STRUCT/ARRAY) is looked at
     * @param bool $acceptSingleVals when true, a bare VALUE is also accepted as top level element
     * @return void
     */
    public function xmlrpc_se($parser, $name, $attrs, $acceptSingleVals = false)
    {
        // if invalid xmlrpc already detected, skip all processing
        if ($this->_xh['isf'] >= 2) {
            return;
        }

        if (count($this->_xh['stack']) == 0) {
            // top level element: only the two envelopes are valid, plus a bare VALUE when the
            // caller explicitly asked for it
            $isEnvelope = ($name == 'METHODRESPONSE' || $name == 'METHODCALL');
            $isBareValue = ($name == 'VALUE' && $acceptSingleVals);
            if (!$isEnvelope && !$isBareValue) {
                $this->_xh['isf'] = 2;
                $this->_xh['isf_reason'] = 'missing top level xmlrpc element';

                return;
            }
            $this->_xh['rt'] = strtolower($name);
        } else {
            // not top level element: see if parent is OK
            $parent = end($this->_xh['stack']);
            if (!array_key_exists($name, $this->xmlrpc_valid_parents)
                || !in_array($parent, $this->xmlrpc_valid_parents[$name], true)
            ) {
                $this->_xh['isf'] = 2;
                $this->_xh['isf_reason'] = "xmlrpc element $name cannot be child of $parent";

                return;
            }
        }

        switch ($name) {
            // most common cases first
            case 'VALUE':
                $this->_xh['vt'] = 'value'; // indicator: no value found yet
                $this->_xh['ac'] = '';
                $this->_xh['lv'] = 1;
                $this->_xh['php_class'] = null;
                break;
            case 'I4':
            case 'INT':
            case 'STRING':
            case 'BOOLEAN':
            case 'DOUBLE':
            case 'DATETIME.ISO8601':
            case 'BASE64':
                if ($this->failIfValueAlreadyTyped($name)) {
                    return;
                }
                $this->_xh['ac'] = ''; // reset the accumulator
                break;
            case 'STRUCT':
            case 'ARRAY':
                if ($this->failIfValueAlreadyTyped($name)) {
                    return;
                }
                // create an empty container for the child values and push it on the value stack
                $container = array(
                    'values' => array(),
                    'type' => $name,
                );
                // out-of-band information used to rebuild php objects: save it if present
                if (isset($attrs['PHP_CLASS'])) {
                    $container['php_class'] = $attrs['PHP_CLASS'];
                }
                $this->_xh['valuestack'][] = $container;
                $this->_xh['vt'] = 'data'; // be prepared for a data element next
                break;
            case 'DATA':
                // vt is 'data' right after ARRAY was opened and is reset to null when DATA is closed
                if ($this->_xh['vt'] != 'data') {
                    $this->_xh['isf'] = 2;
                    $this->_xh['isf_reason'] = "found two data elements inside an array element";

                    return;
                }
                break;
            case 'METHODCALL':
            case 'METHODRESPONSE':
            case 'PARAMS':
                // valid elements that add little to processing
                break;
            case 'METHODNAME':
            case 'NAME':
                $this->_xh['ac'] = '';
                break;
            case 'FAULT':
                $this->_xh['isf'] = 1;
                break;
            case 'MEMBER':
                // start with an empty member name, in case no NAME element is found later on
                $this->_xh['valuestack'][count($this->_xh['valuestack']) - 1]['name'] = '';
                // fall through intentionally
            case 'PARAM':
                // clear value type, so we can check later if no value has been passed for this param/member
                $this->_xh['vt'] = null;
                break;
            case 'NIL':
            case 'EX:NIL':
                if (\PhpXmlRpc\PhpXmlRpc::$xmlrpc_null_extension) {
                    if ($this->failIfValueAlreadyTyped($name)) {
                        return;
                    }
                    $this->_xh['ac'] = ''; // reset the accumulator
                    break;
                }
                // the <NIL/> extension is not enabled: fall through intentionally
            default:
                $this->_xh['isf'] = 2;
                $this->_xh['isf_reason'] = "found not-xmlrpc xml element $name";
                break;
        }

        // Save current element name to stack, to validate nesting
        $this->_xh['stack'][] = $name;

        if ($name != 'VALUE') {
            $this->_xh['lv'] = 0;
        }
    }

    /**
     * Flags the document as invalid when a typed element is opened inside a VALUE that already
     * got one (vt is no longer the 'value' placeholder).
     *
     * @param string $name name of the element being opened, used in the fault reason
     * @return bool true when the fault has been recorded and the caller must stop
     */
    private function failIfValueAlreadyTyped($name)
    {
        if ($this->_xh['vt'] == 'value') {
            return false;
        }

        // two data elements inside a value: an error occurred!
        $this->_xh['isf'] = 2;
        $this->_xh['isf_reason'] = "$name element following a {$this->_xh['vt']} element inside a single value";

        return true;
    }

    /**
     * Start-of-element handler that also accepts a bare VALUE as top level element.
     *
     * @param mixed $parser
     * @param string $name
     * @param array $attrs
     * @return void
     */
    public function xmlrpc_se_any($parser, $name, $attrs)
    {
        $this->xmlrpc_se($parser, $name, $attrs, true);
    }

    /**
     * End-of-element handler: converts the accumulated character data into a php value and
     * stores it where the enclosing element expects it.
     *
     * @param mixed $parser the xml parser invoking the callback (unused)
     * @param string $name upper-case element name
     * @param bool $rebuildXmlrpcvals when true every VALUE is wrapped in a \PhpXmlRpc\Value object,
     *                                when false plain php values are kept
     * @return void
     */
    public function xmlrpc_ee($parser, $name, $rebuildXmlrpcvals = true)
    {
        if ($this->_xh['isf'] >= 2) {
            return;
        }

        // pop this element name from the stack.
        // NB: if the XML is well formed, correct opening/closing is guaranteed and we do not have
        // to compare $name with the popped name. Nesting was checked when the element was opened.
        array_pop($this->_xh['stack']);

        switch ($name) {
            case 'VALUE':
                // no typed element was found inside <VALUE></VALUE>: the text is a string
                if ($this->_xh['vt'] == 'value') {
                    $this->_xh['value'] = $this->_xh['ac'];
                    $this->_xh['vt'] = \PhpXmlRpc\Value::$xmlrpcString;
                }

                if ($rebuildXmlrpcvals) {
                    // build the xmlrpc val out of the data received, and substitute it
                    $built = new \PhpXmlRpc\Value($this->_xh['value'], $this->_xh['vt']);
                    // in case we got info about the underlying php class, save it in the object
                    if (isset($this->_xh['php_class'])) {
                        $built->_php_class = $this->_xh['php_class'];
                    }
                    // a value inside an array is appended to the array being built on the stack
                    $depth = count($this->_xh['valuestack']);
                    if ($depth && $this->_xh['valuestack'][$depth - 1]['type'] == 'ARRAY') {
                        $this->_xh['valuestack'][$depth - 1]['values'][] = $built;
                    } else {
                        $this->_xh['value'] = $built;
                    }
                } else {
                    // plain php values: php_class information, if any, is not used in this mode.
                    // A value inside an array is appended to the array being built on the stack
                    $depth = count($this->_xh['valuestack']);
                    if ($depth && $this->_xh['valuestack'][$depth - 1]['type'] == 'ARRAY') {
                        $this->_xh['valuestack'][$depth - 1]['values'][] = $this->_xh['value'];
                    }
                }
                break;
            case 'BOOLEAN':
            case 'I4':
            case 'INT':
            case 'STRING':
            case 'DOUBLE':
            case 'DATETIME.ISO8601':
            case 'BASE64':
                $this->_xh['vt'] = strtolower($name);
                $text = $this->_xh['ac'];
                switch ($name) {
                    case 'STRING':
                        $this->_xh['value'] = $text;
                        break;
                    case 'DATETIME.ISO8601':
                        // a malformed date is logged but still passed on as received
                        if (!preg_match('/^[0-9]{8}T[0-9]{2}:[0-9]{2}:[0-9]{2}$/', $text)) {
                            error_log('XML-RPC: ' . __METHOD__ . ': invalid value received in DATETIME: ' . $text);
                        }
                        $this->_xh['vt'] = \PhpXmlRpc\Value::$xmlrpcDateTime;
                        $this->_xh['value'] = $text;
                        break;
                    case 'BASE64':
                        $this->_xh['value'] = base64_decode($text);
                        break;
                    case 'BOOLEAN':
                        // 1/0 are translated into php true/false. The strings 'true' and 'false'
                        // are accepted too, even though the spec never mentions them.
                        // Anything else is logged and treated as false.
                        if ($text == '1' || strcasecmp($text, 'true') == 0) {
                            $this->_xh['value'] = true;
                        } else {
                            if ($text != '0' && strcasecmp($text, 'false') != 0) {
                                error_log('XML-RPC: ' . __METHOD__ . ': invalid value received in BOOLEAN: ' . $text);
                            }
                            $this->_xh['value'] = false;
                        }
                        break;
                    case 'DOUBLE':
                        // only sign, digits, dot, exponent marker, space and tab are allowed.
                        // The hyphen is escaped: unescaped, "+-e" is a character range that lets
                        // upper-case letters and most punctuation through.
                        // NOTE: the regexp could be much stricter than this...
                        if (!preg_match('/^[+\-eE0123456789 \t.]+$/', $text)) {
                            error_log('XML-RPC: ' . __METHOD__ . ': non numeric value received in DOUBLE: ' . $text);
                            $this->_xh['value'] = 'ERROR_NON_NUMERIC_FOUND';
                        } else {
                            $this->_xh['value'] = (float)$text;
                        }
                        break;
                    default:
                        // I4 / INT: optional sign followed by digits, spaces and tabs
                        if (!preg_match('/^[+-]?[0123456789 \t]+$/', $text)) {
                            error_log('XML-RPC: ' . __METHOD__ . ': non numeric value received in INT: ' . $text);
                            $this->_xh['value'] = 'ERROR_NON_NUMERIC_FOUND';
                        } else {
                            $this->_xh['value'] = (int)$text;
                        }
                        break;
                }
                $this->_xh['lv'] = 3; // indicate we've found a value
                break;
            case 'NAME':
                $this->_xh['valuestack'][count($this->_xh['valuestack']) - 1]['name'] = $this->_xh['ac'];
                break;
            case 'MEMBER':
                // add the last value built to the struct on top of the stack, keyed by member
                // name, unless no VALUE was found
                if ($this->_xh['vt']) {
                    $top = count($this->_xh['valuestack']) - 1;
                    $memberName = $this->_xh['valuestack'][$top]['name'];
                    $this->_xh['valuestack'][$top]['values'][$memberName] = $this->_xh['value'];
                } else {
                    error_log('XML-RPC: ' . __METHOD__ . ': missing VALUE inside STRUCT in received xml');
                }
                break;
            case 'DATA':
                // reset this to detect 2 data elements in a row - even if they're empty
                $this->_xh['vt'] = null;
                break;
            case 'STRUCT':
            case 'ARRAY':
                // fetch the collected values out of the stack and promote them to current value
                $finished = array_pop($this->_xh['valuestack']);
                $this->_xh['value'] = $finished['values'];
                $this->_xh['vt'] = strtolower($name);
                if (isset($finished['php_class'])) {
                    $this->_xh['php_class'] = $finished['php_class'];
                }
                break;
            case 'PARAM':
                // add the current value to the list of params, unless no VALUE was found
                if ($this->_xh['vt']) {
                    $this->_xh['params'][] = $this->_xh['value'];
                    $this->_xh['pt'][] = $this->_xh['vt'];
                } else {
                    error_log('XML-RPC: ' . __METHOD__ . ': missing VALUE inside PARAM in received xml');
                }
                break;
            case 'METHODNAME':
                // leading whitespace is stripped from the method name
                $this->_xh['method'] = preg_replace('/^[\n\r\t ]+/', '', $this->_xh['ac']);
                break;
            case 'NIL':
            case 'EX:NIL':
                if (\PhpXmlRpc\PhpXmlRpc::$xmlrpc_null_extension) {
                    $this->_xh['vt'] = 'null';
                    $this->_xh['value'] = null;
                    $this->_xh['lv'] = 3;
                    break;
                }
                // fall through intentionally if the nil extension is not enabled
            case 'PARAMS':
            case 'FAULT':
            case 'METHODCALL':
            case 'METHODRESPONSE':
                break;
            default:
                // end of an element that is not part of xmlrpc: nothing to do
                break;
        }
    }

    /**
     * End-of-element handler that keeps plain php values instead of building Value objects.
     *
     * @param mixed $parser
     * @param string $name
     * @return void
     */
    public function xmlrpc_ee_fast($parser, $name)
    {
        $this->xmlrpc_ee($parser, $name, false);
    }

    /**
     * Character data handler: appends the text to the accumulator.
     *
     * @param mixed $parser the xml parser invoking the callback (unused)
     * @param string $data
     * @return void
     */
    public function xmlrpc_cd($parser, $data)
    {
        // skip processing if xml fault already detected
        if ($this->_xh['isf'] >= 2) {
            return;
        }

        // lv == 3 means that an entire value has been found already:
        // any further character data is discarded
        if ($this->_xh['lv'] != 3) {
            $this->_xh['ac'] .= $data;
        }
    }

    /**
     * Default handler: keeps data that looks like an entity reference ("&...;") by appending it
     * to the accumulator; everything else is dropped.
     *
     * @param mixed $parser the xml parser invoking the callback (unused)
     * @param string $data
     * @return bool always true
     */
    public function xmlrpc_dh($parser, $data)
    {
        // skip processing if xml fault already detected
        if ($this->_xh['isf'] < 2 && substr($data, 0, 1) == '&' && substr($data, -1, 1) == ';') {
            $this->_xh['ac'] .= $data;
        }

        return true;
    }

    /**
     * Guesses the character set of an xml document received over http.
     *
     * Sources are tried in this order, the first hit wins:
     *  1. the charset parameter of the http Content-type header
     *  2. a byte order mark at the start of the data
     *  3. the encoding attribute of the xml declaration
     *  4. mb_detect_encoding(), when the mbstring extension is loaded
     *  5. the library default, \PhpXmlRpc\PhpXmlRpc::$xmlrpc_defencoding
     *
     * @param string $httpHeader the http Content-type header value
     * @param string $xmlChunk the xml text, or at least its beginning
     * @param mixed $encodingPrefs encodings passed to mb_detect_encoding(); when null,
     *                             \PhpXmlRpc\PhpXmlRpc::$xmlrpc_detectencodings is used if set
     * @return string|false the charset name (upper case for cases 1 and 3), or whatever
     *                      mb_detect_encoding() returned, including false on failure
     */
    public static function guessEncoding($httpHeader = '', $xmlChunk = '', $encodingPrefs = null)
    {
        // discussion: see http://www.yale.edu/pclt/encoding/

        // 1 - test if encoding is specified in HTTP HEADERS
        // header: Content-type = ...; charset=value(; ...)*
        // Note: invalid chars in the value are not checked for.
        // Note 2: whitespace/tabs and double quotes around the value are removed, which would be
        // wrong for a quoted-string charset containing them. But nobody uses such charset names...
        $matches = array();
        if (preg_match('/;\s*charset\s*=([^;]+)/i', $httpHeader, $matches)) {
            return strtoupper(trim($matches[1], " \t\""));
        }

        // 2 - scan the first bytes of the data for a UTF-16 (or other) BOM pattern
        // (source: http://www.w3.org/TR/2000/REC-xml-20001006)
        // NOTE: according to the spec, even when a BOM is found the encoding specified in the xml
        // declaration should be checked for consistency. This is not done here.
        if (preg_match('/^(\x00\x00\xFE\xFF|\xFF\xFE\x00\x00|\x00\x00\xFF\xFE|\xFE\xFF\x00\x00)/', $xmlChunk)) {
            return 'UCS-4';
        }
        if (preg_match('/^(\xFE\xFF|\xFF\xFE)/', $xmlChunk)) {
            return 'UTF-16';
        }
        if (preg_match('/^(\xEF\xBB\xBF)/', $xmlChunk)) {
            return 'UTF-8';
        }

        // 3 - test if encoding is specified in the xml declaration
        if (preg_match('/^<\?xml\s+version\s*=\s*' . "((?:\"[a-zA-Z0-9_.:-]+\")|(?:'[a-zA-Z0-9_.:-]+'))" .
            '\s+encoding\s*=\s*' . "((?:\"[A-Za-z][A-Za-z0-9._-]*\")|(?:'[A-Za-z][A-Za-z0-9._-]*'))/",
            $xmlChunk, $matches)) {
            // strip the surrounding quotes
            return strtoupper(substr($matches[2], 1, -1));
        }

        // 5 - no encoding specified and no way to detect it.
        // RFC 2616 (HTTP 1.1) and 1945 (HTTP 1.0) state that iso-8859-1 is the standard for
        // text/xxx content types, but to stay backward compatible the library default is used
        if (!extension_loaded('mbstring')) {
            return \PhpXmlRpc\PhpXmlRpc::$xmlrpc_defencoding;
        }

        // 4 - mbstring is available, let it do the guesswork
        if ($encodingPrefs == null && \PhpXmlRpc\PhpXmlRpc::$xmlrpc_detectencodings != null) {
            $encodingPrefs = \PhpXmlRpc\PhpXmlRpc::$xmlrpc_detectencodings;
        }
        if ($encodingPrefs) {
            $enc = mb_detect_encoding($xmlChunk, $encodingPrefs);
        } else {
            $enc = mb_detect_encoding($xmlChunk);
        }
        // NB: mb_detect likes to call it ascii, xml parser likes to call it US_ASCII...
        // IANA also likes better US-ASCII, so go with it
        if ($enc == 'ASCII') {
            $enc = 'US-' . $enc;
        }

        return $enc;
    }

    /**
     * Tells whether an xml document declares its own character set, either with a byte order
     * mark or with the encoding attribute of the xml declaration.
     *
     * @param string $xmlChunk the xml text, or at least its beginning
     * @return bool
     */
    public static function hasEncoding($xmlChunk)
    {
        // scan the first bytes of the data for a UTF-16 (or other) BOM pattern
        // (source: http://www.w3.org/TR/2000/REC-xml-20001006)
        if (preg_match('/^(\x00\x00\xFE\xFF|\xFF\xFE\x00\x00|\x00\x00\xFF\xFE|\xFE\xFF\x00\x00)/', $xmlChunk)) {
            return true;
        }
        if (preg_match('/^(\xFE\xFF|\xFF\xFE)/', $xmlChunk)) {
            return true;
        }
        if (preg_match('/^(\xEF\xBB\xBF)/', $xmlChunk)) {
            return true;
        }

        // test if encoding is specified in the xml declaration
        if (preg_match('/^<\?xml\s+version\s*=\s*' . "((?:\"[a-zA-Z0-9_.:-]+\")|(?:'[a-zA-Z0-9_.:-]+'))" .
            '\s+encoding\s*=\s*' . "((?:\"[A-Za-z][A-Za-z0-9._-]*\")|(?:'[A-Za-z][A-Za-z0-9._-]*'))/",
            $xmlChunk)) {
            return true;
        }

        return false;
    }
}
PhpXmlRpc\Helper\XMLParser\hasEncoding
static hasEncoding($xmlChunk)
Definition: XMLParser.php:523
PhpXmlRpc\Helper
Definition: Charset.php:3
PhpXmlRpc\Value\$xmlrpcDateTime
static $xmlrpcDateTime
Definition: Value.php:14
PhpXmlRpc\Helper\XMLParser\guessEncoding
static guessEncoding($httpHeader='', $xmlChunk='', $encodingPrefs=null)
Definition: XMLParser.php:444
PhpXmlRpc\Helper\XMLParser\xmlrpc_se
xmlrpc_se($parser, $name, $attrs, $acceptSingleVals=false)
Definition: XMLParser.php:64
PhpXmlRpc\Helper\XMLParser\xmlrpc_ee_fast
xmlrpc_ee_fast($parser, $name)
Definition: XMLParser.php:387
PhpXmlRpc\Helper\XMLParser\$_xh
$_xh
Definition: XMLParser.php:27
PhpXmlRpc\PhpXmlRpc\$xmlrpc_null_extension
static $xmlrpc_null_extension
Definition: PhpXmlRpc.php:87
PhpXmlRpc\Value
Definition: Value.php:7
PhpXmlRpc\Helper\XMLParser\xmlrpc_cd
xmlrpc_cd($parser, $data)
Definition: XMLParser.php:395
PhpXmlRpc\Value\$xmlrpcString
static $xmlrpcString
Definition: Value.php:13
PhpXmlRpc\Helper\XMLParser\xmlrpc_se_any
xmlrpc_se_any($parser, $name, $attrs)
Definition: XMLParser.php:204
PhpXmlRpc\Helper\XMLParser\xmlrpc_dh
xmlrpc_dh($parser, $data)
Definition: XMLParser.php:411
PhpXmlRpc\Helper\XMLParser
Definition: XMLParser.php:11
PhpXmlRpc\PhpXmlRpc
Definition: PhpXmlRpc.php:5
PhpXmlRpc\PhpXmlRpc\$xmlrpc_defencoding
static $xmlrpc_defencoding
Definition: PhpXmlRpc.php:64
PhpXmlRpc\PhpXmlRpc\$xmlrpc_detectencodings
static $xmlrpc_detectencodings
Definition: PhpXmlRpc.php:70
PhpXmlRpc\Helper\XMLParser\$xmlrpc_valid_parents
$xmlrpc_valid_parents
Definition: XMLParser.php:39
PhpXmlRpc\Helper\XMLParser\xmlrpc_ee
xmlrpc_ee($parser, $name, $rebuildXmlrpcvals=true)
Definition: XMLParser.php:212