2D Api Source Code

 index.php

----------------

<?php


require('stmm_html_dom.php');

$html=file_get_html('https://marketdata.set.or.th/mkt/marketsummary.do?language=th&country=TH');

$i=0;

$mArray=array();

foreach($html->find('td') as $e){

     array_push($mArray,$e->innertext);

     $i++;

     if($i>7){

         break;

     }

}


$first=str_split($mArray[1]);

$second=str_split($mArray[7]);

$pointPos=strpos($mArray[7],'.');


$result=array();

$result["RESULT"]=$first[count($first)-1].''.$second[$pointPos-1];

$result["SET"]=$mArray[1];

$result["VALUE"]=$mArray[7];


echo json_encode($result);


?>


stmm_html_dom.php

-------------------------

Download Here


<?php

/**

 * Website: http://sourceforge.net/projects/simplehtmldom/

 * Additional projects: http://sourceforge.net/projects/debugobject/

 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)

 *

 * Licensed under The MIT License

 * See the LICENSE file in the project root for more information.

 *

 * Authors:

 *   S.C. Chen

 *   John Schlick

 *   Rus Carroll

 *   logmanoriginal

 *

 * Contributors:

 *   Yousuke Kumakura

 *   Vadim Voituk

 *   Antcs

 *

 * Version Rev. 1.9.1 (291)

 */


define('HDOM_TYPE_ELEMENT', 1);

define('HDOM_TYPE_COMMENT', 2);

define('HDOM_TYPE_TEXT', 3);

define('HDOM_TYPE_ENDTAG', 4);

define('HDOM_TYPE_ROOT', 5);

define('HDOM_TYPE_UNKNOWN', 6);

define('HDOM_QUOTE_DOUBLE', 0);

define('HDOM_QUOTE_SINGLE', 1);

define('HDOM_QUOTE_NO', 3);

define('HDOM_INFO_BEGIN', 0);

define('HDOM_INFO_END', 1);

define('HDOM_INFO_QUOTE', 2);

define('HDOM_INFO_SPACE', 3);

define('HDOM_INFO_TEXT', 4);

define('HDOM_INFO_INNER', 5);

define('HDOM_INFO_OUTER', 6);

define('HDOM_INFO_ENDSPACE', 7);


defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8');

defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n");

defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' ');

defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000);

define('HDOM_SMARTY_AS_TEXT', 1);


function file_get_html(

$url,

$use_include_path = false,

$context = null,

$offset = 0,

$maxLen = -1,

$lowercase = true,

$forceTagsClosed = true,

$target_charset = DEFAULT_TARGET_CHARSET,

$stripRN = true,

$defaultBRText = DEFAULT_BR_TEXT,

$defaultSpanText = DEFAULT_SPAN_TEXT)

{

if($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; }


$dom = new simple_html_dom(

null,

$lowercase,

$forceTagsClosed,

$target_charset,

$stripRN,

$defaultBRText,

$defaultSpanText

);


/**

* For sourceforge users: uncomment the next line and comment the

* retrieve_url_contents line 2 lines down if it is not already done.

*/

$contents = file_get_contents(

$url,

$use_include_path,

$context,

$offset,

$maxLen

);

// $contents = retrieve_url_contents($url);


if (empty($contents) || strlen($contents) > $maxLen) {

$dom->clear();

return false;

}


return $dom->load($contents, $lowercase, $stripRN);

}


function str_get_html(

$str,

$lowercase = true,

$forceTagsClosed = true,

$target_charset = DEFAULT_TARGET_CHARSET,

$stripRN = true,

$defaultBRText = DEFAULT_BR_TEXT,

$defaultSpanText = DEFAULT_SPAN_TEXT)

{

$dom = new simple_html_dom(

null,

$lowercase,

$forceTagsClosed,

$target_charset,

$stripRN,

$defaultBRText,

$defaultSpanText

);


if (empty($str) || strlen($str) > MAX_FILE_SIZE) {

$dom->clear();

return false;

}


return $dom->load($str, $lowercase, $stripRN);

}


function dump_html_tree($node, $show_attr = true, $deep = 0)

{

$node->dump($node);

}


class simple_html_dom_node

{

public $nodetype = HDOM_TYPE_TEXT;

public $tag = 'text';

public $attr = array();

public $children = array();

public $nodes = array();

public $parent = null;

public $_ = array();

public $tag_start = 0;

private $dom = null;


function __construct($dom)

{

$this->dom = $dom;

$dom->nodes[] = $this;

}


function __destruct()

{

$this->clear();

}


function __toString()

{

return $this->outertext();

}


function clear()

{

$this->dom = null;

$this->nodes = null;

$this->parent = null;

$this->children = null;

}


function dump($show_attr = true, $depth = 0)

{

echo str_repeat("\t", $depth) . $this->tag;


if ($show_attr && count($this->attr) > 0) {

echo '(';

foreach ($this->attr as $k => $v) {

echo "[$k]=>\"$v\", ";

}

echo ')';

}


echo "\n";


if ($this->nodes) {

foreach ($this->nodes as $node) {

$node->dump($show_attr, $depth + 1);

}

}

}


function dump_node($echo = true)

{

$string = $this->tag;


if (count($this->attr) > 0) {

$string .= '(';

foreach ($this->attr as $k => $v) {

$string .= "[$k]=>\"$v\", ";

}

$string .= ')';

}


if (count($this->_) > 0) {

$string .= ' $_ (';

foreach ($this->_ as $k => $v) {

if (is_array($v)) {

$string .= "[$k]=>(";

foreach ($v as $k2 => $v2) {

$string .= "[$k2]=>\"$v2\", ";

}

$string .= ')';

} else {

$string .= "[$k]=>\"$v\", ";

}

}

$string .= ')';

}


if (isset($this->text)) {

$string .= " text: ({$this->text})";

}


$string .= ' HDOM_INNER_INFO: ';


if (isset($node->_[HDOM_INFO_INNER])) {

$string .= "'" . $node->_[HDOM_INFO_INNER] . "'";

} else {

$string .= ' NULL ';

}


$string .= ' children: ' . count($this->children);

$string .= ' nodes: ' . count($this->nodes);

$string .= ' tag_start: ' . $this->tag_start;

$string .= "\n";


if ($echo) {

echo $string;

return;

} else {

return $string;

}

}


function parent($parent = null)

{

// I am SURE that this doesn't work properly.

// It fails to unset the current node from it's current parents nodes or

// children list first.

if ($parent !== null) {

$this->parent = $parent;

$this->parent->nodes[] = $this;

$this->parent->children[] = $this;

}


return $this->parent;

}


function has_child()

{

return !empty($this->children);

}


function children($idx = -1)

{

if ($idx === -1) {

return $this->children;

}


if (isset($this->children[$idx])) {

return $this->children[$idx];

}


return null;

}


function first_child()

{

if (count($this->children) > 0) {

return $this->children[0];

}

return null;

}


function last_child()

{

if (count($this->children) > 0) {

return end($this->children);

}

return null;

}


function next_sibling()

{

if ($this->parent === null) {

return null;

}


$idx = array_search($this, $this->parent->children, true);


if ($idx !== false && isset($this->parent->children[$idx + 1])) {

return $this->parent->children[$idx + 1];

}


return null;

}


function prev_sibling()

{

if ($this->parent === null) {

return null;

}


$idx = array_search($this, $this->parent->children, true);


if ($idx !== false && $idx > 0) {

return $this->parent->children[$idx - 1];

}


return null;

}


function find_ancestor_tag($tag)

{

global $debug_object;

if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }


if ($this->parent === null) {

return null;

}


$ancestor = $this->parent;


while (!is_null($ancestor)) {

if (is_object($debug_object)) {

$debug_object->debug_log(2, 'Current tag is: ' . $ancestor->tag);

}


if ($ancestor->tag === $tag) {

break;

}


$ancestor = $ancestor->parent;

}


return $ancestor;

}


function innertext()

{

if (isset($this->_[HDOM_INFO_INNER])) {

return $this->_[HDOM_INFO_INNER];

}


if (isset($this->_[HDOM_INFO_TEXT])) {

return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

}


$ret = '';


foreach ($this->nodes as $n) {

$ret .= $n->outertext();

}


return $ret;

}


function outertext()

{

global $debug_object;


if (is_object($debug_object)) {

$text = '';


if ($this->tag === 'text') {

if (!empty($this->text)) {

$text = ' with text: ' . $this->text;

}

}


$debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);

}


if ($this->tag === 'root') {

return $this->innertext();

}


// todo: What is the use of this callback? Remove?

if ($this->dom && $this->dom->callback !== null) {

call_user_func_array($this->dom->callback, array($this));

}


if (isset($this->_[HDOM_INFO_OUTER])) {

return $this->_[HDOM_INFO_OUTER];

}


if (isset($this->_[HDOM_INFO_TEXT])) {

return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

}


$ret = '';


if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) {

$ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();

}


if (isset($this->_[HDOM_INFO_INNER])) {

// todo: <br> should either never have HDOM_INFO_INNER or always

if ($this->tag !== 'br') {

$ret .= $this->_[HDOM_INFO_INNER];

}

} elseif ($this->nodes) {

foreach ($this->nodes as $n) {

$ret .= $this->convert_text($n->outertext());

}

}


if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {

$ret .= '</' . $this->tag . '>';

}


return $ret;

}


function text()

{

if (isset($this->_[HDOM_INFO_INNER])) {

return $this->_[HDOM_INFO_INNER];

}


switch ($this->nodetype) {

case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

case HDOM_TYPE_COMMENT: return '';

case HDOM_TYPE_UNKNOWN: return '';

}


if (strcasecmp($this->tag, 'script') === 0) { return ''; }

if (strcasecmp($this->tag, 'style') === 0) { return ''; }


$ret = '';


// In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed

// for some span tags, and some p tags) $this->nodes is set to NULL.

// NOTE: This indicates that there is a problem where it's set to NULL

// without a clear happening.

// WHY is this happening?

if (!is_null($this->nodes)) {

foreach ($this->nodes as $n) {

// Start paragraph after a blank line

if ($n->tag === 'p') {

$ret = trim($ret) . "\n\n";

}


$ret .= $this->convert_text($n->text());


// If this node is a span... add a space at the end of it so

// multiple spans don't run into each other.  This is plaintext

// after all.

if ($n->tag === 'span') {

$ret .= $this->dom->default_span_text;

}

}

}

return $ret;

}


function xmltext()

{

$ret = $this->innertext();

$ret = str_ireplace('<![CDATA[', '', $ret);

$ret = str_replace(']]>', '', $ret);

return $ret;

}


function makeup()

{

// text, comment, unknown

if (isset($this->_[HDOM_INFO_TEXT])) {

return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

}


$ret = '<' . $this->tag;

$i = -1;


foreach ($this->attr as $key => $val) {

++$i;


// skip removed attribute

if ($val === null || $val === false) { continue; }


$ret .= $this->_[HDOM_INFO_SPACE][$i][0];


//no value attr: nowrap, checked selected...

if ($val === true) {

$ret .= $key;

} else {

switch ($this->_[HDOM_INFO_QUOTE][$i])

{

case HDOM_QUOTE_DOUBLE: $quote = '"'; break;

case HDOM_QUOTE_SINGLE: $quote = '\''; break;

default: $quote = '';

}


$ret .= $key

. $this->_[HDOM_INFO_SPACE][$i][1]

. '='

. $this->_[HDOM_INFO_SPACE][$i][2]

. $quote

. $val

. $quote;

}

}


$ret = $this->dom->restore_noise($ret);

return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';

}


function find($selector, $idx = null, $lowercase = false)

{

$selectors = $this->parse_selector($selector);

if (($count = count($selectors)) === 0) { return array(); }

$found_keys = array();


// find each selector

for ($c = 0; $c < $count; ++$c) {

// The change on the below line was documented on the sourceforge

// code tracker id 2788009

// used to be: if (($levle=count($selectors[0]))===0) return array();

if (($levle = count($selectors[$c])) === 0) { return array(); }

if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); }


$head = array($this->_[HDOM_INFO_BEGIN] => 1);

$cmd = ' '; // Combinator


// handle descendant selectors, no recursive!

for ($l = 0; $l < $levle; ++$l) {

$ret = array();


foreach ($head as $k => $v) {

$n = ($k === -1) ? $this->dom->root : $this->dom->nodes[$k];

//PaperG - Pass this optional parameter on to the seek function.

$n->seek($selectors[$c][$l], $ret, $cmd, $lowercase);

}


$head = $ret;

$cmd = $selectors[$c][$l][4]; // Next Combinator

}


foreach ($head as $k => $v) {

if (!isset($found_keys[$k])) {

$found_keys[$k] = 1;

}

}

}


// sort keys

ksort($found_keys);


$found = array();

foreach ($found_keys as $k => $v) {

$found[] = $this->dom->nodes[$k];

}


// return nth-element or array

if (is_null($idx)) { return $found; }

elseif ($idx < 0) { $idx = count($found) + $idx; }

return (isset($found[$idx])) ? $found[$idx] : null;

}


protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)

{

global $debug_object;

if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }


list($tag, $id, $class, $attributes, $cmb) = $selector;

$nodes = array();


if ($parent_cmd === ' ') { // Descendant Combinator

// Find parent closing tag if the current element doesn't have a closing

// tag (i.e. void element)

$end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;

if ($end == 0) {

$parent = $this->parent;

while (!isset($parent->_[HDOM_INFO_END]) && $parent !== null) {

$end -= 1;

$parent = $parent->parent;

}

$end += $parent->_[HDOM_INFO_END];

}


// Get list of target nodes

$nodes_start = $this->_[HDOM_INFO_BEGIN] + 1;

$nodes_count = $end - $nodes_start;

$nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true);

} elseif ($parent_cmd === '>') { // Child Combinator

$nodes = $this->children;

} elseif ($parent_cmd === '+'

&& $this->parent

&& in_array($this, $this->parent->children)) { // Next-Sibling Combinator

$index = array_search($this, $this->parent->children, true) + 1;

if ($index < count($this->parent->children))

$nodes[] = $this->parent->children[$index];

} elseif ($parent_cmd === '~'

&& $this->parent

&& in_array($this, $this->parent->children)) { // Subsequent Sibling Combinator

$index = array_search($this, $this->parent->children, true);

$nodes = array_slice($this->parent->children, $index);

}


// Go throgh each element starting at this element until the end tag

// Note: If this element is a void tag, any previous void element is

// skipped.

foreach($nodes as $node) {

$pass = true;


// Skip root nodes

if(!$node->parent) {

$pass = false;

}


// Handle 'text' selector

if($pass && $tag === 'text' && $node->tag === 'text') {

$ret[array_search($node, $this->dom->nodes, true)] = 1;

unset($node);

continue;

}


// Skip if node isn't a child node (i.e. text nodes)

if($pass && !in_array($node, $node->parent->children, true)) {

$pass = false;

}


// Skip if tag doesn't match

if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') {

$pass = false;

}


// Skip if ID doesn't exist

if ($pass && $id !== '' && !isset($node->attr['id'])) {

$pass = false;

}


// Check if ID matches

if ($pass && $id !== '' && isset($node->attr['id'])) {

// Note: Only consider the first ID (as browsers do)

$node_id = explode(' ', trim($node->attr['id']))[0];


if($id !== $node_id) { $pass = false; }

}


// Check if all class(es) exist

if ($pass && $class !== '' && is_array($class) && !empty($class)) {

if (isset($node->attr['class'])) {

$node_classes = explode(' ', $node->attr['class']);


if ($lowercase) {

$node_classes = array_map('strtolower', $node_classes);

}


foreach($class as $c) {

if(!in_array($c, $node_classes)) {

$pass = false;

break;

}

}

} else {

$pass = false;

}

}


// Check attributes

if ($pass

&& $attributes !== ''

&& is_array($attributes)

&& !empty($attributes)) {

foreach($attributes as $a) {

list (

$att_name,

$att_expr,

$att_val,

$att_inv,

$att_case_sensitivity

) = $a;


// Handle indexing attributes (i.e. "[2]")

/**

* Note: This is not supported by the CSS Standard but adds

* the ability to select items compatible to XPath (i.e.

* the 3rd element within it's parent).

*

* Note: This doesn't conflict with the CSS Standard which

* doesn't work on numeric attributes anyway.

*/

if (is_numeric($att_name)

&& $att_expr === ''

&& $att_val === '') {

$count = 0;


// Find index of current element in parent

foreach ($node->parent->children as $c) {

if ($c->tag === $node->tag) ++$count;

if ($c === $node) break;

}


// If this is the correct node, continue with next

// attribute

if ($count === (int)$att_name) continue;

}


// Check attribute availability

if ($att_inv) { // Attribute should NOT be set

if (isset($node->attr[$att_name])) {

$pass = false;

break;

}

} else { // Attribute should be set

// todo: "plaintext" is not a valid CSS selector!

if ($att_name !== 'plaintext'

&& !isset($node->attr[$att_name])) {

$pass = false;

break;

}

}


// Continue with next attribute if expression isn't defined

if ($att_expr === '') continue;


// If they have told us that this is a "plaintext"

// search then we want the plaintext of the node - right?

// todo "plaintext" is not a valid CSS selector!

if ($att_name === 'plaintext') {

$nodeKeyValue = $node->text();

} else {

$nodeKeyValue = $node->attr[$att_name];

}


if (is_object($debug_object)) {

$debug_object->debug_log(2,

'testing node: '

. $node->tag

. ' for attribute: '

. $att_name

. $att_expr

. $att_val

. ' where nodes value is: '

. $nodeKeyValue

);

}


// If lowercase is set, do a case insensitive test of

// the value of the selector.

if ($lowercase) {

$check = $this->match(

$att_expr,

strtolower($att_val),

strtolower($nodeKeyValue),

$att_case_sensitivity

);

} else {

$check = $this->match(

$att_expr,

$att_val,

$nodeKeyValue,

$att_case_sensitivity

);

}


if (is_object($debug_object)) {

$debug_object->debug_log(2,

'after match: '

. ($check ? 'true' : 'false')

);

}


if (!$check) {

$pass = false;

break;

}

}

}


// Found a match. Add to list and clear node

if ($pass) $ret[$node->_[HDOM_INFO_BEGIN]] = 1;

unset($node);

}

// It's passed by reference so this is actually what this function returns.

if (is_object($debug_object)) {

$debug_object->debug_log(1, 'EXIT - ret: ', $ret);

}

}


protected function match($exp, $pattern, $value, $case_sensitivity)

{

global $debug_object;

if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}


if ($case_sensitivity === 'i') {

$pattern = strtolower($pattern);

$value = strtolower($value);

}


switch ($exp) {

case '=':

return ($value === $pattern);

case '!=':

return ($value !== $pattern);

case '^=':

return preg_match('/^' . preg_quote($pattern, '/') . '/', $value);

case '$=':

return preg_match('/' . preg_quote($pattern, '/') . '$/', $value);

case '*=':

return preg_match('/' . preg_quote($pattern, '/') . '/', $value);

case '|=':

/**

* [att|=val]

*

* Represents an element with the att attribute, its value

* either being exactly "val" or beginning with "val"

* immediately followed by "-" (U+002D).

*/

return strpos($value, $pattern) === 0;

case '~=':

/**

* [att~=val]

*

* Represents an element with the att attribute whose value is a

* whitespace-separated list of words, one of which is exactly

* "val". If "val" contains whitespace, it will never represent

* anything (since the words are separated by spaces). Also if

* "val" is the empty string, it will never represent anything.

*/

return in_array($pattern, explode(' ', trim($value)), true);

}

return false;

}


protected function parse_selector($selector_string)

{

global $debug_object;

if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }


/**

* Pattern of CSS selectors, modified from mootools (https://mootools.net/)

*

* Paperg: Add the colon to the attribute, so that it properly finds

* <tag attr:ibute="something" > like google does.

*

* Note: if you try to look at this attribute, you MUST use getAttribute

* since $dom->x:y will fail the php syntax check.

*

* Notice the \[ starting the attribute? and the @? following? This

* implies that an attribute can begin with an @ sign that is not

* captured. This implies that an html attribute specifier may start

* with an @ sign that is NOT captured by the expression. Farther study

* is required to determine of this should be documented or removed.

*

* Matches selectors in this order:

*

* [0] - full match

*

* [1] - tag name

*     ([\w:\*-]*)

*     Matches the tag name consisting of zero or more words, colons,

*     asterisks and hyphens.

*

* [2] - id name

*     (?:\#([\w-]+))

*     Optionally matches a id name, consisting of an "#" followed by

*     the id name (one or more words and hyphens).

*

* [3] - class names (including dots)

*     (?:\.([\w\.-]+))?

*     Optionally matches a list of classs, consisting of an "."

*     followed by the class name (one or more words and hyphens)

*     where multiple classes can be chained (i.e. ".foo.bar.baz")

*

* [4] - attributes

*     ((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?

*     Optionally matches the attributes list

*

* [5] - separator

*     ([\/, >+~]+)

*     Matches the selector list separator

*/

// phpcs:ignore Generic.Files.LineLength

$pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is";


preg_match_all(

$pattern,

trim($selector_string) . ' ', // Add final ' ' as pseudo separator

$matches,

PREG_SET_ORDER

);


if (is_object($debug_object)) {

$debug_object->debug_log(2, 'Matches Array: ', $matches);

}


$selectors = array();

$result = array();


foreach ($matches as $m) {

$m[0] = trim($m[0]);


// Skip NoOps

if ($m[0] === '' || $m[0] === '/' || $m[0] === '//') { continue; }


// Convert to lowercase

if ($this->dom->lowercase) {

$m[1] = strtolower($m[1]);

}


// Extract classes

if ($m[3] !== '') { $m[3] = explode('.', $m[3]); }


/* Extract attributes (pattern based on the pattern above!)


* [0] - full match

* [1] - attribute name

* [2] - attribute expression

* [3] - attribute value

* [4] - case sensitivity

*

* Note: Attributes can be negated with a "!" prefix to their name

*/

if($m[4] !== '') {

preg_match_all(

"/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is",

trim($m[4]),

$attributes,

PREG_SET_ORDER

);


// Replace element by array

$m[4] = array();


foreach($attributes as $att) {

// Skip empty matches

if(trim($att[0]) === '') { continue; }


$inverted = (isset($att[1][0]) && $att[1][0] === '!');

$m[4][] = array(

$inverted ? substr($att[1], 1) : $att[1], // Name

(isset($att[2])) ? $att[2] : '', // Expression

(isset($att[3])) ? $att[3] : '', // Value

$inverted, // Inverted Flag

(isset($att[4])) ? strtolower($att[4]) : '', // Case-Sensitivity

);

}

}


// Sanitize Separator

if ($m[5] !== '' && trim($m[5]) === '') { // Descendant Separator

$m[5] = ' ';

} else { // Other Separator

$m[5] = trim($m[5]);

}


// Clear Separator if it's a Selector List

if ($is_list = ($m[5] === ',')) { $m[5] = ''; }


// Remove full match before adding to results

array_shift($m);

$result[] = $m;


if ($is_list) { // Selector List

$selectors[] = $result;

$result = array();

}

}


if (count($result) > 0) { $selectors[] = $result; }

return $selectors;

}


function __get($name)

{

if (isset($this->attr[$name])) {

return $this->convert_text($this->attr[$name]);

}

switch ($name) {

case 'outertext': return $this->outertext();

case 'innertext': return $this->innertext();

case 'plaintext': return $this->text();

case 'xmltext': return $this->xmltext();

default: return array_key_exists($name, $this->attr);

}

}


function __set($name, $value)

{

global $debug_object;

if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }


switch ($name) {

case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;

case 'innertext':

if (isset($this->_[HDOM_INFO_TEXT])) {

return $this->_[HDOM_INFO_TEXT] = $value;

}

return $this->_[HDOM_INFO_INNER] = $value;

}


if (!isset($this->attr[$name])) {

$this->_[HDOM_INFO_SPACE][] = array(' ', '', '');

$this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;

}


$this->attr[$name] = $value;

}


function __isset($name)

{

switch ($name) {

case 'outertext': return true;

case 'innertext': return true;

case 'plaintext': return true;

}

//no value attr: nowrap, checked selected...

return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]);

}


function __unset($name)

{

if (isset($this->attr[$name])) { unset($this->attr[$name]); }

}


function convert_text($text)

{

global $debug_object;

if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }


$converted_text = $text;


$sourceCharset = '';

$targetCharset = '';


if ($this->dom) {

$sourceCharset = strtoupper($this->dom->_charset);

$targetCharset = strtoupper($this->dom->_target_charset);

}


if (is_object($debug_object)) {

$debug_object->debug_log(3,

'source charset: '

. $sourceCharset

. ' target charaset: '

. $targetCharset

);

}


if (!empty($sourceCharset)

&& !empty($targetCharset)

&& (strcasecmp($sourceCharset, $targetCharset) != 0)) {

// Check if the reported encoding could have been incorrect and the text is actually already UTF-8

if ((strcasecmp($targetCharset, 'UTF-8') == 0)

&& ($this->is_utf8($text))) {

$converted_text = $text;

} else {

$converted_text = iconv($sourceCharset, $targetCharset, $text);

}

}


// Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.

if ($targetCharset === 'UTF-8') {

if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") {

$converted_text = substr($converted_text, 3);

}


if (substr($converted_text, -3) === "\xef\xbb\xbf") {

$converted_text = substr($converted_text, 0, -3);

}

}


return $converted_text;

}


static function is_utf8($str)

{

$c = 0; $b = 0;

$bits = 0;

$len = strlen($str);

for($i = 0; $i < $len; $i++) {

$c = ord($str[$i]);

if($c > 128) {

if(($c >= 254)) { return false; }

elseif($c >= 252) { $bits = 6; }

elseif($c >= 248) { $bits = 5; }

elseif($c >= 240) { $bits = 4; }

elseif($c >= 224) { $bits = 3; }

elseif($c >= 192) { $bits = 2; }

else { return false; }

if(($i + $bits) > $len) { return false; }

while($bits > 1) {

$i++;

$b = ord($str[$i]);

if($b < 128 || $b > 191) { return false; }

$bits--;

}

}

}

return true;

}


function get_display_size()

{

global $debug_object;


$width = -1;

$height = -1;


if ($this->tag !== 'img') {

return false;

}


// See if there is aheight or width attribute in the tag itself.

if (isset($this->attr['width'])) {

$width = $this->attr['width'];

}


if (isset($this->attr['height'])) {

$height = $this->attr['height'];

}


// Now look for an inline style.

if (isset($this->attr['style'])) {

// Thanks to user gnarf from stackoverflow for this regular expression.

$attributes = array();


preg_match_all(

'/([\w-]+)\s*:\s*([^;]+)\s*;?/',

$this->attr['style'],

$matches,

PREG_SET_ORDER

);


foreach ($matches as $match) {

$attributes[$match[1]] = $match[2];

}


// If there is a width in the style attributes:

if (isset($attributes['width']) && $width == -1) {

// check that the last two characters are px (pixels)

if (strtolower(substr($attributes['width'], -2)) === 'px') {

$proposed_width = substr($attributes['width'], 0, -2);

// Now make sure that it's an integer and not something stupid.

if (filter_var($proposed_width, FILTER_VALIDATE_INT)) {

$width = $proposed_width;

}

}

}


// If there is a width in the style attributes:

if (isset($attributes['height']) && $height == -1) {

// check that the last two characters are px (pixels)

if (strtolower(substr($attributes['height'], -2)) == 'px') {

$proposed_height = substr($attributes['height'], 0, -2);

// Now make sure that it's an integer and not something stupid.

if (filter_var($proposed_height, FILTER_VALIDATE_INT)) {

$height = $proposed_height;

}

}

}


}


// Future enhancement:

// Look in the tag to see if there is a class or id specified that has

// a height or width attribute to it.


// Far future enhancement

// Look at all the parent tags of this image to see if they specify a

// class or id that has an img selector that specifies a height or width

// Note that in this case, the class or id will have the img subselector

// for it to apply to the image.


// ridiculously far future development

// If the class or id is specified in a SEPARATE css file thats not on

// the page, go get it and do what we were just doing for the ones on

// the page.


$result = array(

'height' => $height,

'width' => $width

);


return $result;

}


function save($filepath = '')

{

$ret = $this->outertext();


if ($filepath !== '') {

file_put_contents($filepath, $ret, LOCK_EX);

}


return $ret;

}


function addClass($class)

{

if (is_string($class)) {

$class = explode(' ', $class);

}


if (is_array($class)) {

foreach($class as $c) {

if (isset($this->class)) {

if ($this->hasClass($c)) {

continue;

} else {

$this->class .= ' ' . $c;

}

} else {

$this->class = $c;

}

}

} else {

if (is_object($debug_object)) {

$debug_object->debug_log(2, 'Invalid type: ', gettype($class));

}

}

}


function hasClass($class)

{

if (is_string($class)) {

if (isset($this->class)) {

return in_array($class, explode(' ', $this->class), true);

}

} else {

if (is_object($debug_object)) {

$debug_object->debug_log(2, 'Invalid type: ', gettype($class));

}

}


return false;

}


function removeClass($class = null)

{

if (!isset($this->class)) {

return;

}


if (is_null($class)) {

$this->removeAttribute('class');

return;

}


if (is_string($class)) {

$class = explode(' ', $class);

}


if (is_array($class)) {

$class = array_diff(explode(' ', $this->class), $class);

if (empty($class)) {

$this->removeAttribute('class');

} else {

$this->class = implode(' ', $class);

}

}

}


function getAllAttributes()

{

return $this->attr;

}


function getAttribute($name)

{

return $this->__get($name);

}


function setAttribute($name, $value)

{

$this->__set($name, $value);

}


function hasAttribute($name)

{

return $this->__isset($name);

}


function removeAttribute($name)

{

$this->__set($name, null);

}


function remove()

{

if ($this->parent) {

$this->parent->removeChild($this);

}

}


function removeChild($node)

{

$nidx = array_search($node, $this->nodes, true);

$cidx = array_search($node, $this->children, true);

$didx = array_search($node, $this->dom->nodes, true);


if ($nidx !== false && $cidx !== false && $didx !== false) {


foreach($node->children as $child) {

$node->removeChild($child);

}


foreach($node->nodes as $entity) {

$enidx = array_search($entity, $node->nodes, true);

$edidx = array_search($entity, $node->dom->nodes, true);


if ($enidx !== false && $edidx !== false) {

unset($node->nodes[$enidx]);

unset($node->dom->nodes[$edidx]);

}

}


unset($this->nodes[$nidx]);

unset($this->children[$cidx]);

unset($this->dom->nodes[$didx]);


$node->clear();


}

}


function getElementById($id)

{

return $this->find("#$id", 0);

}


function getElementsById($id, $idx = null)

{

return $this->find("#$id", $idx);

}


function getElementByTagName($name)

{

return $this->find($name, 0);

}


function getElementsByTagName($name, $idx = null)

{

return $this->find($name, $idx);

}


function parentNode()

{

return $this->parent();

}


function childNodes($idx = -1)

{

return $this->children($idx);

}


function firstChild()

{

return $this->first_child();

}


function lastChild()

{

return $this->last_child();

}


function nextSibling()

{

return $this->next_sibling();

}


function previousSibling()

{

return $this->prev_sibling();

}


function hasChildNodes()

{

return $this->has_child();

}


function nodeName()

{

return $this->tag;

}


function appendChild($node)

{

$node->parent($this);

return $node;

}


}


class simple_html_dom

{

public $root = null;

public $nodes = array();

public $callback = null;

public $lowercase = false;

public $original_size;

public $size;


protected $pos;

protected $doc;

protected $char;


protected $cursor;

protected $parent;

protected $noise = array();

protected $token_blank = " \t\r\n";

protected $token_equal = ' =/>';

protected $token_slash = " />\r\n\t";

protected $token_attr = ' >';


public $_charset = '';

public $_target_charset = '';


protected $default_br_text = '';


public $default_span_text = '';


protected $self_closing_tags = array(

'area' => 1,

'base' => 1,

'br' => 1,

'col' => 1,

'embed' => 1,

'hr' => 1,

'img' => 1,

'input' => 1,

'link' => 1,

'meta' => 1,

'param' => 1,

'source' => 1,

'track' => 1,

'wbr' => 1

);

protected $block_tags = array(

'body' => 1,

'div' => 1,

'form' => 1,

'root' => 1,

'span' => 1,

'table' => 1

);

protected $optional_closing_tags = array(

// Not optional, see

// https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element

'b' => array('b' => 1),

'dd' => array('dd' => 1, 'dt' => 1),

// Not optional, see

// https://www.w3.org/TR/html/grouping-content.html#the-dl-element

'dl' => array('dd' => 1, 'dt' => 1),

'dt' => array('dd' => 1, 'dt' => 1),

'li' => array('li' => 1),

'optgroup' => array('optgroup' => 1, 'option' => 1),

'option' => array('optgroup' => 1, 'option' => 1),

'p' => array('p' => 1),

'rp' => array('rp' => 1, 'rt' => 1),

'rt' => array('rp' => 1, 'rt' => 1),

'td' => array('td' => 1, 'th' => 1),

'th' => array('td' => 1, 'th' => 1),

'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),

);


function __construct(

$str = null,

$lowercase = true,

$forceTagsClosed = true,

$target_charset = DEFAULT_TARGET_CHARSET,

$stripRN = true,

$defaultBRText = DEFAULT_BR_TEXT,

$defaultSpanText = DEFAULT_SPAN_TEXT,

$options = 0)

{

if ($str) {

if (preg_match('/^http:\/\//i', $str) || is_file($str)) {

$this->load_file($str);

} else {

$this->load(

$str,

$lowercase,

$stripRN,

$defaultBRText,

$defaultSpanText,

$options

);

}

}

// Forcing tags to be closed implies that we don't trust the html, but

// it can lead to parsing errors if we SHOULD trust the html.

if (!$forceTagsClosed) {

$this->optional_closing_array = array();

}


$this->_target_charset = $target_charset;

}


function __destruct()

{

$this->clear();

}


function load(

$str,

$lowercase = true,

$stripRN = true,

$defaultBRText = DEFAULT_BR_TEXT,

$defaultSpanText = DEFAULT_SPAN_TEXT,

$options = 0)

{

global $debug_object;


// prepare

$this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);


// Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037

// Script tags removal now preceeds style tag removal.

// strip out <script> tags

$this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");

$this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");


// strip out the \r \n's if we are told to.

if ($stripRN) {

$this->doc = str_replace("\r", ' ', $this->doc);

$this->doc = str_replace("\n", ' ', $this->doc);


// set the length of content since we have changed it.

$this->size = strlen($this->doc);

}


// strip out cdata

$this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);

// strip out comments

$this->remove_noise("'<!--(.*?)-->'is");

// strip out <style> tags

$this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");

$this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");

// strip out preformatted tags

$this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");

// strip out server side scripts

$this->remove_noise("'(<\?)(.*?)(\?>)'s", true);


if($options & HDOM_SMARTY_AS_TEXT) { // Strip Smarty scripts

$this->remove_noise("'(\{\w)(.*?)(\})'s", true);

}


// parsing

$this->parse();

// end

$this->root->_[HDOM_INFO_END] = $this->cursor;

$this->parse_charset();


// make load function chainable

return $this;

}


function load_file()

{

$args = func_get_args();


if(($doc = call_user_func_array('file_get_contents', $args)) !== false) {

$this->load($doc, true);

} else {

return false;

}

}


function set_callback($function_name)

{

$this->callback = $function_name;

}


function remove_callback()

{

$this->callback = null;

}


function save($filepath = '')

{

$ret = $this->root->innertext();

if ($filepath !== '') { file_put_contents($filepath, $ret, LOCK_EX); }

return $ret;

}


function find($selector, $idx = null, $lowercase = false)

{

return $this->root->find($selector, $idx, $lowercase);

}


function clear()

{

if (isset($this->nodes)) {

foreach ($this->nodes as $n) {

$n->clear();

$n = null;

}

}


// This add next line is documented in the sourceforge repository.

// 2977248 as a fix for ongoing memory leaks that occur even with the

// use of clear.

if (isset($this->children)) {

foreach ($this->children as $n) {

$n->clear();

$n = null;

}

}


if (isset($this->parent)) {

$this->parent->clear();

unset($this->parent);

}


if (isset($this->root)) {

$this->root->clear();

unset($this->root);

}


unset($this->doc);

unset($this->noise);

}


function dump($show_attr = true)

{

$this->root->dump($show_attr);

}


protected function prepare(

$str, $lowercase = true,

$defaultBRText = DEFAULT_BR_TEXT,

$defaultSpanText = DEFAULT_SPAN_TEXT)

{

$this->clear();


$this->doc = trim($str);

$this->size = strlen($this->doc);

$this->original_size = $this->size; // original size of the html

$this->pos = 0;

$this->cursor = 1;

$this->noise = array();

$this->nodes = array();

$this->lowercase = $lowercase;

$this->default_br_text = $defaultBRText;

$this->default_span_text = $defaultSpanText;

$this->root = new simple_html_dom_node($this);

$this->root->tag = 'root';

$this->root->_[HDOM_INFO_BEGIN] = -1;

$this->root->nodetype = HDOM_TYPE_ROOT;

$this->parent = $this->root;

if ($this->size > 0) { $this->char = $this->doc[0]; }

}


protected function parse()

{

while (true) {

// Read next tag if there is no text between current position and the

// next opening tag.

if (($s = $this->copy_until_char('<')) === '') {

if($this->read_tag()) {

continue;

} else {

return true;

}

}


// Add a text node for text between tags

$node = new simple_html_dom_node($this);

++$this->cursor;

$node->_[HDOM_INFO_TEXT] = $s;

$this->link_nodes($node, false);

}

}


protected function parse_charset()

{

global $debug_object;


$charset = null;


if (function_exists('get_last_retrieve_url_contents_content_type')) {

$contentTypeHeader = get_last_retrieve_url_contents_content_type();

$success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches);

if ($success) {

$charset = $matches[1];

if (is_object($debug_object)) {

$debug_object->debug_log(2,

'header content-type found charset of: '

. $charset

);

}

}

}


if (empty($charset)) {

// https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type

$el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);


if (!empty($el)) {

$fullvalue = $el->content;

if (is_object($debug_object)) {

$debug_object->debug_log(2,

'meta content-type tag found'

. $fullvalue

);

}


if (!empty($fullvalue)) {

$success = preg_match(

'/charset=(.+)/i',

$fullvalue,

$matches

);


if ($success) {

$charset = $matches[1];

} else {

// If there is a meta tag, and they don't specify the

// character set, research says that it's typically

// ISO-8859-1

if (is_object($debug_object)) {

$debug_object->debug_log(2,

'meta content-type tag couldn\'t be parsed. using iso-8859 default.'

);

}


$charset = 'ISO-8859-1';

}

}

}

}


if (empty($charset)) {

// https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration

if ($meta = $this->root->find('meta[charset]', 0)) {

$charset = $meta->charset;

if (is_object($debug_object)) {

$debug_object->debug_log(2, 'meta charset: ' . $charset);

}

}

}


if (empty($charset)) {

// Try to guess the charset based on the content

// Requires Multibyte String (mbstring) support (optional)

if (function_exists('mb_detect_encoding')) {

/**

* mb_detect_encoding() is not intended to distinguish between

* charsets, especially single-byte charsets. Its primary

* purpose is to detect which multibyte encoding is in use,

* i.e. UTF-8, UTF-16, shift-JIS, etc.

*

* -- https://bugs.php.net/bug.php?id=38138

*

* Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will

* always result in CP1251/ISO-8859-5 and vice versa.

*

* Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1

* to stay compatible.

*/

$encoding = mb_detect_encoding(

$this->doc,

array( 'UTF-8', 'CP1252', 'ISO-8859-1' )

);


if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') {

// Due to a limitation of mb_detect_encoding

// 'CP1251'/'ISO-8859-5' will be detected as

// 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in

// which case we can simply assume it is the other charset.

if (!@iconv('CP1252', 'UTF-8', $this->doc)) {

$encoding = 'CP1251';

}

}


if ($encoding !== false) {

$charset = $encoding;

if (is_object($debug_object)) {

$debug_object->debug_log(2, 'mb_detect: ' . $charset);

}

}

}

}


if (empty($charset)) {

// Assume it's UTF-8 as it is the most likely charset to be used

$charset = 'UTF-8';

if (is_object($debug_object)) {

$debug_object->debug_log(2, 'No match found, assume ' . $charset);

}

}


// Since CP1252 is a superset, if we get one of it's subsets, we want

// it instead.

if ((strtolower($charset) == 'iso-8859-1')

|| (strtolower($charset) == 'latin1')

|| (strtolower($charset) == 'latin-1')) {

$charset = 'CP1252';

if (is_object($debug_object)) {

$debug_object->debug_log(2,

'replacing ' . $charset . ' with CP1252 as its a superset'

);

}

}


if (is_object($debug_object)) {

$debug_object->debug_log(1, 'EXIT - ' . $charset);

}


return $this->_charset = $charset;

}


protected function read_tag()

{

// Set end position if no further tags found

if ($this->char !== '<') {

$this->root->_[HDOM_INFO_END] = $this->cursor;

return false;

}


$begin_tag_pos = $this->pos;

$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next


// end tag

if ($this->char === '/') {

$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next


// Skip whitespace in end tags (i.e. in "</   html>")

$this->skip($this->token_blank);

$tag = $this->copy_until_char('>');


// Skip attributes in end tags

if (($pos = strpos($tag, ' ')) !== false) {

$tag = substr($tag, 0, $pos);

}


$parent_lower = strtolower($this->parent->tag);

$tag_lower = strtolower($tag);


// The end tag is supposed to close the parent tag. Handle situations

// when it doesn't

if ($parent_lower !== $tag_lower) {

// Parent tag does not have to be closed necessarily (optional closing tag)

// Current tag is a block tag, so it may close an ancestor

if (isset($this->optional_closing_tags[$parent_lower])

&& isset($this->block_tags[$tag_lower])) {


$this->parent->_[HDOM_INFO_END] = 0;

$org_parent = $this->parent;


// Traverse ancestors to find a matching opening tag

// Stop at root node

while (($this->parent->parent)

&& strtolower($this->parent->tag) !== $tag_lower

){

$this->parent = $this->parent->parent;

}


// If we don't have a match add current tag as text node

if (strtolower($this->parent->tag) !== $tag_lower) {

$this->parent = $org_parent; // restore origonal parent


if ($this->parent->parent) {

$this->parent = $this->parent->parent;

}


$this->parent->_[HDOM_INFO_END] = $this->cursor;

return $this->as_text_node($tag);

}

} elseif (($this->parent->parent)

&& isset($this->block_tags[$tag_lower])

) {

// Grandparent exists and current tag is a block tag, so our

// parent doesn't have an end tag

$this->parent->_[HDOM_INFO_END] = 0; // No end tag

$org_parent = $this->parent;


// Traverse ancestors to find a matching opening tag

// Stop at root node

while (($this->parent->parent)

&& strtolower($this->parent->tag) !== $tag_lower

) {

$this->parent = $this->parent->parent;

}


// If we don't have a match add current tag as text node

if (strtolower($this->parent->tag) !== $tag_lower) {

$this->parent = $org_parent; // restore origonal parent

$this->parent->_[HDOM_INFO_END] = $this->cursor;

return $this->as_text_node($tag);

}

} elseif (($this->parent->parent)

&& strtolower($this->parent->parent->tag) === $tag_lower

) { // Grandparent exists and current tag closes it

$this->parent->_[HDOM_INFO_END] = 0;

$this->parent = $this->parent->parent;

} else { // Random tag, add as text node

return $this->as_text_node($tag);

}

}


// Set end position of parent tag to current cursor position

$this->parent->_[HDOM_INFO_END] = $this->cursor;


if ($this->parent->parent) {

$this->parent = $this->parent->parent;

}


$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

return true;

}


// start tag

$node = new simple_html_dom_node($this);

$node->_[HDOM_INFO_BEGIN] = $this->cursor;

++$this->cursor;

$tag = $this->copy_until($this->token_slash); // Get tag name

$node->tag_start = $begin_tag_pos;


// doctype, cdata & comments...

// <!DOCTYPE html>

// <![CDATA[ ... ]]>

// <!-- Comment -->

if (isset($tag[0]) && $tag[0] === '!') {

$node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');


if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--")

$node->nodetype = HDOM_TYPE_COMMENT;

$node->tag = 'comment';

} else { // Could be doctype or CDATA but we don't care

$node->nodetype = HDOM_TYPE_UNKNOWN;

$node->tag = 'unknown';

}


if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }


$this->link_nodes($node, true);

$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

return true;

}


// The start tag cannot contain another start tag, if so add as text

// i.e. "<<html>"

if ($pos = strpos($tag, '<') !== false) {

$tag = '<' . substr($tag, 0, -1);

$node->_[HDOM_INFO_TEXT] = $tag;

$this->link_nodes($node, false);

$this->char = $this->doc[--$this->pos]; // prev

return true;

}


// Handle invalid tag names (i.e. "<html#doc>")

if (!preg_match('/^\w[\w:-]*$/', $tag)) {

$node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');


// Next char is the beginning of a new tag, don't touch it.

if ($this->char === '<') {

$this->link_nodes($node, false);

return true;

}


// Next char closes current tag, add and be done with it.

if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }

$this->link_nodes($node, false);

$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

return true;

}


// begin tag, add new node

$node->nodetype = HDOM_TYPE_ELEMENT;

$tag_lower = strtolower($tag);

$node->tag = ($this->lowercase) ? $tag_lower : $tag;


// handle optional closing tags

if (isset($this->optional_closing_tags[$tag_lower])) {

// Traverse ancestors to close all optional closing tags

while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {

$this->parent->_[HDOM_INFO_END] = 0;

$this->parent = $this->parent->parent;

}

$node->parent = $this->parent;

}


$guard = 0; // prevent infinity loop


// [0] Space between tag and first attribute

$space = array($this->copy_skip($this->token_blank), '', '');


// attributes

do {

// Everything until the first equal sign should be the attribute name

$name = $this->copy_until($this->token_equal);


if ($name === '' && $this->char !== null && $space[0] === '') {

break;

}


if ($guard === $this->pos) { // Escape infinite loop

$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

continue;

}


$guard = $this->pos;


// handle endless '<'

// Out of bounds before the tag ended

if ($this->pos >= $this->size - 1 && $this->char !== '>') {

$node->nodetype = HDOM_TYPE_TEXT;

$node->_[HDOM_INFO_END] = 0;

$node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;

$node->tag = 'text';

$this->link_nodes($node, false);

return true;

}


// handle mismatch '<'

// Attributes cannot start after opening tag

if ($this->doc[$this->pos - 1] == '<') {

$node->nodetype = HDOM_TYPE_TEXT;

$node->tag = 'text';

$node->attr = array();

$node->_[HDOM_INFO_END] = 0;

$node->_[HDOM_INFO_TEXT] = substr(

$this->doc,

$begin_tag_pos,

$this->pos - $begin_tag_pos - 1

);

$this->pos -= 2;

$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

$this->link_nodes($node, false);

return true;

}


if ($name !== '/' && $name !== '') { // this is a attribute name

// [1] Whitespace after attribute name

$space[1] = $this->copy_skip($this->token_blank);


$name = $this->restore_noise($name); // might be a noisy name


if ($this->lowercase) { $name = strtolower($name); }


if ($this->char === '=') { // attribute with value

$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

$this->parse_attr($node, $name, $space); // get attribute value

} else {

//no value attr: nowrap, checked selected...

$node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;

$node->attr[$name] = true;

if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev

}


$node->_[HDOM_INFO_SPACE][] = $space;


// prepare for next attribute

$space = array(

$this->copy_skip($this->token_blank),

'',

''

);

} else { // no more attributes

break;

}

} while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended


$this->link_nodes($node, true);

$node->_[HDOM_INFO_ENDSPACE] = $space[0];


// handle empty tags (i.e. "<div/>")

if ($this->copy_until_char('>') === '/') {

$node->_[HDOM_INFO_ENDSPACE] .= '/';

$node->_[HDOM_INFO_END] = 0;

} else {

// reset parent

if (!isset($this->self_closing_tags[strtolower($node->tag)])) {

$this->parent = $node;

}

}


$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next


// If it's a BR tag, we need to set it's text to the default text.

// This way when we see it in plaintext, we can generate formatting that the user wants.

// since a br tag never has sub nodes, this works well.

if ($node->tag === 'br') {

$node->_[HDOM_INFO_INNER] = $this->default_br_text;

}


return true;

}


protected function parse_attr($node, $name, &$space)

{

$is_duplicate = isset($node->attr[$name]);


if (!$is_duplicate) // Copy whitespace between "=" and value

$space[2] = $this->copy_skip($this->token_blank);


switch ($this->char) {

case '"':

$quote_type = HDOM_QUOTE_DOUBLE;

$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

$value = $this->copy_until_char('"');

$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

break;

case '\'':

$quote_type = HDOM_QUOTE_SINGLE;

$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

$value = $this->copy_until_char('\'');

$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

break;

default:

$quote_type = HDOM_QUOTE_NO;

$value = $this->copy_until($this->token_attr);

}


$value = $this->restore_noise($value);


// PaperG: Attributes should not have \r or \n in them, that counts as

// html whitespace.

$value = str_replace("\r", '', $value);

$value = str_replace("\n", '', $value);


// PaperG: If this is a "class" selector, lets get rid of the preceeding

// and trailing space since some people leave it in the multi class case.

if ($name === 'class') {

$value = trim($value);

}


if (!$is_duplicate) {

$node->_[HDOM_INFO_QUOTE][] = $quote_type;

$node->attr[$name] = $value;

}

}


protected function link_nodes(&$node, $is_child)

{

$node->parent = $this->parent;

$this->parent->nodes[] = $node;

if ($is_child) {

$this->parent->children[] = $node;

}

}


protected function as_text_node($tag)

{

$node = new simple_html_dom_node($this);

++$this->cursor;

$node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';

$this->link_nodes($node, false);

$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

return true;

}


protected function skip($chars)

{

$this->pos += strspn($this->doc, $chars, $this->pos);

$this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

}


protected function copy_skip($chars)

{

$pos = $this->pos;

$len = strspn($this->doc, $chars, $pos);

$this->pos += $len;

$this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

if ($len === 0) { return ''; }

return substr($this->doc, $pos, $len);

}


protected function copy_until($chars)

{

$pos = $this->pos;

$len = strcspn($this->doc, $chars, $pos);

$this->pos += $len;

$this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

return substr($this->doc, $pos, $len);

}


protected function copy_until_char($char)

{

if ($this->char === null) { return ''; }


if (($pos = strpos($this->doc, $char, $this->pos)) === false) {

$ret = substr($this->doc, $this->pos, $this->size - $this->pos);

$this->char = null;

$this->pos = $this->size;

return $ret;

}


if ($pos === $this->pos) { return ''; }


$pos_old = $this->pos;

$this->char = $this->doc[$pos];

$this->pos = $pos;

return substr($this->doc, $pos_old, $pos - $pos_old);

}


protected function remove_noise($pattern, $remove_tag = false)

{

global $debug_object;

if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }


$count = preg_match_all(

$pattern,

$this->doc,

$matches,

PREG_SET_ORDER | PREG_OFFSET_CAPTURE

);


for ($i = $count - 1; $i > -1; --$i) {

$key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);


if (is_object($debug_object)) {

$debug_object->debug_log(2, 'key is: ' . $key);

}


$idx = ($remove_tag) ? 0 : 1; // 0 = entire match, 1 = submatch

$this->noise[$key] = $matches[$i][$idx][0];

$this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0]));

}


// reset the length of content

$this->size = strlen($this->doc);


if ($this->size > 0) {

$this->char = $this->doc[0];

}

}


function restore_noise($text)

{

global $debug_object;

if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }


while (($pos = strpos($text, '___noise___')) !== false) {

// Sometimes there is a broken piece of markup, and we don't GET the

// pos+11 etc... token which indicates a problem outside of us...


// todo: "___noise___1000" (or any number with four or more digits)

// in the DOM causes an infinite loop which could be utilized by

// malicious software

if (strlen($text) > $pos + 15) {

$key = '___noise___'

. $text[$pos + 11]

. $text[$pos + 12]

. $text[$pos + 13]

. $text[$pos + 14]

. $text[$pos + 15];


if (is_object($debug_object)) {

$debug_object->debug_log(2, 'located key of: ' . $key);

}


if (isset($this->noise[$key])) {

$text = substr($text, 0, $pos)

. $this->noise[$key]

. substr($text, $pos + 16);

} else {

// do this to prevent an infinite loop.

$text = substr($text, 0, $pos)

. 'UNDEFINED NOISE FOR KEY: '

. $key

. substr($text, $pos + 16);

}

} else {

// There is no valid key being given back to us... We must get

// rid of the ___noise___ or we will have a problem.

$text = substr($text, 0, $pos)

. 'NO NUMERIC NOISE KEY'

. substr($text, $pos + 11);

}

}

return $text;

}


function search_noise($text)

{

global $debug_object;

if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }


foreach($this->noise as $noiseElement) {

if (strpos($noiseElement, $text) !== false) {

return $noiseElement;

}

}

}


function __toString()

{

return $this->root->innertext();

}


function __get($name)

{

switch ($name) {

case 'outertext':

return $this->root->innertext();

case 'innertext':

return $this->root->innertext();

case 'plaintext':

return $this->root->text();

case 'charset':

return $this->_charset;

case 'target_charset':

return $this->_target_charset;

}

}


function childNodes($idx = -1)

{

return $this->root->childNodes($idx);

}


function firstChild()

{

return $this->root->first_child();

}


function lastChild()

{

return $this->root->last_child();

}


function createElement($name, $value = null)

{

return @str_get_html("<$name>$value</$name>")->firstChild();

}


function createTextNode($value)

{

return @end(str_get_html($value)->nodes);

}


function getElementById($id)

{

return $this->find("#$id", 0);

}


function getElementsById($id, $idx = null)

{

return $this->find("#$id", $idx);

}


function getElementByTagName($name)

{

return $this->find($name, 0);

}


function getElementsByTagName($name, $idx = -1)

{

return $this->find($name, $idx);

}


function loadFile()

{

$args = func_get_args();

$this->load_file($args);

}

}