368 lines
		
	
	
		
			7.9 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
			
		
		
	
	
			368 lines
		
	
	
		
			7.9 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
| <?php
 | |
| 
 | |
| /**
 | |
|  * @author          Tassos Marinos <info@tassos.gr>
 | |
|  * @link            https://www.tassos.gr
 | |
|  * @copyright       Copyright © 2024 Tassos All Rights Reserved
 | |
|  * @license         GNU GPLv3 <http://www.gnu.org/licenses/gpl.html> or later
 | |
|  */
 | |
| 
 | |
| namespace NRFramework;
 | |
| 
 | |
| use Joomla\String\StringHelper;
 | |
| use NRFramework\Cache;
 | |
| 
 | |
| defined('_JEXEC') or die;
 | |
| 
 | |
class DOMCrawler
{
	/**
	 * The content to crawl, as produced by stringToUTF8(): non-ASCII
	 * characters are stored as numeric HTML entities.
	 *
	 * @var string
	 */
	protected $content;

	/**
	 * The nodes discovered by the last call to filter().
	 *
	 * Null until filter() has run a query; DOMXPath::query() may also
	 * hand back false for an invalid expression.
	 *
	 * @var \DOMNodeList|false|null
	 */
	public $nodes;

	/**
	 * Class constructor
	 *
	 * @param	mixed	$content	The content to crawl. When null, the value returned by
	 * 								\NRFramework\Functions::getBuffer() is used instead.
	 */
	public function __construct($content = null)
	{
		if (is_null($content))
		{
			$content = \NRFramework\Functions::getBuffer();
		}

		$this->setContent($content);
	}

	/**
	 * Set content to crawl
	 *
	 * @param	string	$content	The content to crawl.
	 *
	 * @return	void
	 */
	public function setContent($content)
	{
		$this->content = $this->stringToUTF8($content);
	}

	/**
	 * Filter DOM elements with a CSS Selector or XPath expression
	 *
	 * An expression starting with "=" is treated as a raw XPath query
	 * (example: =//h1[contains(@class, "faq-question")]); anything else is
	 * converted from a simple CSS selector by cssSelectorToXPath().
	 *
	 * @param	string	$expression	 A CSS Selector or XPath expression
	 *
	 * @return	$this
	 */
	public function filter($expression)
	{
		// Forget the previous query so an early return below can never
		// expose nodes that belong to a different expression.
		$this->nodes = null;

		// Nothing to search in, or nothing to search for
		if (empty($this->content) || empty($expression))
		{
			return $this;
		}

		if (!class_exists('DOMDocument') || !class_exists('DOMXPath'))
		{
			return $this;
		}

		// The cache key must cover the content as well as the expression,
		// otherwise two crawlers holding different documents would share
		// the result of the first one.
		$hash = md5($expression . '|' . md5($this->content));

		if (Cache::has($hash))
		{
			$this->nodes = Cache::get($hash, false);
			return $this;
		}

		// Collect libxml parse errors internally instead of raising warnings
		// for real-world (often invalid) HTML, and remember the previous
		// setting so it can be restored afterwards.
		$previousErrorMode = libxml_use_internal_errors(true);

		$dom = new \DOMDocument;
		$dom->loadHTML($this->content);
		$finder = new \DOMXPath($dom);

		if (substr($expression, 0, 1) === '=')
		{
			// Raw XPath query: strip the leading "=" marker
			$xpath = StringHelper::substr($expression, 1);
		}
		else
		{
			// Create the XPath via the provided selector
			$xpath = $this->cssSelectorToXPath($expression);
		}

		$this->nodes = $finder->query($xpath);

		// Free the collected errors and hand libxml back in the state we found it
		libxml_clear_errors();
		libxml_use_internal_errors($previousErrorMode);

		// Speed up filtering by caching results
		Cache::set($hash, $this->nodes);

		return $this;
	}

	/**
	 * Returns the HTML of the discovered node(s)
	 *
	 * @param	mixed	$fallback	The value to return if no node is found
	 * @param	boolean $inner		If set to true, only the node's inner HTML will be returned.
	 * @param	boolean $firstOnly	If set to true, only the first node will be returned.
	 *
	 * @return	string|array	A string when $firstOnly is true, otherwise one entry per node
	 */
	public function html($fallback = '', $inner = false, $firstOnly = true)
	{
		return $this->readNodes(
			function ($node) use ($inner)
			{
				return $this->getNodeHTML($node, $inner);
			},
			$fallback,
			$firstOnly
		);
	}

	/**
	 * Returns the text content of the discovered node(s)
	 *
	 * @param	mixed	$fallback	The value to return if no node is found
	 * @param	boolean $firstOnly	If set to true, only the first node will be returned.
	 *
	 * @return	string|array	A string when $firstOnly is true, otherwise one entry per node
	 */
	public function text($fallback = '', $firstOnly = true)
	{
		return $this->readNodes(
			function ($node)
			{
				return $node->textContent;
			},
			$fallback,
			$firstOnly
		);
	}

	/**
	 * Returns the value of an attribute of the discovered node(s)
	 *
	 * @param	string	$attribute_name		The name of the attribute
	 * @param	mixed	$fallback			The value to return if no nodes found
	 * @param	boolean $firstOnly			If set to true, only the first node will be returned.
	 *
	 * @return	string|array	A string when $firstOnly is true, otherwise one entry per node
	 */
	public function attr($attribute_name, $fallback = '', $firstOnly = true)
	{
		return $this->readNodes(
			function ($node) use ($attribute_name)
			{
				// A raw XPath query may select text or attribute nodes,
				// which have no getAttribute() method.
				return $node instanceof \DOMElement ? $node->getAttribute($attribute_name) : '';
			},
			$fallback,
			$firstOnly
		);
	}

	/**
	 * Returns the total number of nodes found
	 *
	 * @param	integer	$fallback	The value to return if no nodes found
	 *
	 * @return	integer
	 */
	public function count($fallback = 0)
	{
		return $this->nodes && $this->nodes->length ? $this->nodes->length : $fallback;
	}

	/**
	 * Helper method to crawl page based on the value of a CSS Selector field.
	 *
	 * @param	array	$props		Expected properties: selector, task, attr
	 * @param	boolean	$firstOnly	If set to true, only the first node is read.
	 *
	 * @return	string|array|integer	Depends on the task: the node count for "count",
	 * 									otherwise a string ($firstOnly) or an array
	 */
	public function readCSSSelectorField($props, $firstOnly = true)
	{
		$props = (array) $props;
		$fallback = $firstOnly ? '' : [];

		if (empty($props['selector']))
		{
			return $fallback;
		}

		$this->filter($props['selector']);

		// A missing task falls through to the default (text) reader
		$task = isset($props['task']) ? $props['task'] : '';

		switch ($task)
		{
			case 'html':
				return $this->html($fallback, false, $firstOnly);

			case 'innerhtml':
				return $this->html($fallback, true, $firstOnly);

			case 'attr':
				$attribute = isset($props['attr']) ? $props['attr'] : '';
				return $this->attr($attribute, $fallback, $firstOnly);

			case 'count':
				return $this->count();

			default:
				return $this->text($fallback, $firstOnly);
		}
	}

	/**
	 * Shared reader behind html(), text() and attr().
	 *
	 * Applies $extractor to the first node, or to every node, and trims
	 * each extracted value.
	 *
	 * @param	callable	$extractor	Receives a node and returns the raw value to read from it
	 * @param	mixed		$fallback	The value to return if no node is found
	 * @param	boolean		$firstOnly	If set to true, only the first node is read.
	 *
	 * @return	string|array
	 */
	private function readNodes($extractor, $fallback, $firstOnly)
	{
		if (!$this->nodes || !$this->nodes->length)
		{
			return $fallback;
		}

		if ($firstOnly)
		{
			return $this->cleanText($extractor($this->nodes->item(0)));
		}

		$result = [];

		foreach ($this->nodes as $node)
		{
			$result[] = $this->cleanText($extractor($node));
		}

		return $result;
	}

	/**
	 * Helper method to clean the text
	 *
	 * @param	string	$text	The text to clean
	 *
	 * @return	string
	 */
	private function cleanText($text)
	{
		return StringHelper::trim($text);
	}

	/**
	 * Transforms a simple CSS Selector to an XPath expression
	 *
	 * Supported: whitespace-separated (descendant) steps, each made of an
	 * optional element name followed by any mix of #id and .class parts,
	 * e.g. "div#main.box p.intro". A step without "." or "#" is copied
	 * into the expression unchanged.
	 *
	 * @param	string	$selector	The CSS selector to transform
	 *
	 * @return	string	XPath expression
	 */
	private function cssSelectorToXPath($selector)
	{
		// Splitting on runs of whitespace avoids the empty steps that
		// repeated spaces would otherwise produce.
		$steps = preg_split('/\s+/', trim((string) $selector), -1, PREG_SPLIT_NO_EMPTY);

		$xpath = '';

		foreach ($steps as $step)
		{
			// Everything before the first "." or "#" is the element name
			preg_match('/^([^.#]*)(.*)$/', $step, $parts);
			$tag = $parts[1] !== '' ? $parts[1] : '*';

			// The remainder is a sequence of "#id" / ".class" parts
			preg_match_all('/([.#])([^.#]+)/', $parts[2], $tokens, PREG_SET_ORDER);

			$conditions = [];

			foreach ($tokens as $token)
			{
				if ($token[1] === '#')
				{
					$conditions[] = '@id="' . $token[2] . '"';
					continue;
				}

				// Padding with spaces matches the class as a whole word
				// inside the space-separated class attribute.
				$conditions[] = 'contains(concat(" ", normalize-space(@class), " "), " ' . $token[2] . ' ")';
			}

			// Every step is a descendant step, so always use "//"
			$xpath .= '//' . $tag;

			if ($conditions)
			{
				$xpath .= '[' . implode(' and ', $conditions) . ']';
			}
		}

		return $xpath;
	}

	/**
	 * Convert a string to UTF-8 with every non-ASCII character written as
	 * a numeric HTML entity, for non-latin languages
	 *
	 * @param	string	$string
	 *
	 * @return	string
	 */
	private function stringToUTF8($string)
	{
		$string = (string) $string;

		// Re-encode invalid input instead of losing the whole content:
		// a plain UTF-8 to UTF-8 iconv() call returns false on the first
		// invalid byte sequence.
		if (!mb_check_encoding($string, 'UTF-8'))
		{
			$string = mb_convert_encoding($string, 'UTF-8', 'UTF-8');
		}

		return mb_encode_numericentity($string, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
	}

	/**
	 * Helper method to return the outer or inner HTML of a node
	 *
	 * @param	\DOMNode	$node	The node object
	 * @param	boolean		$inner	Whether to return the inner (true) or outer (false) HTML
	 *
	 * @return	string		The HTML of the node
	 */
	private function getNodeHTML($node, $inner = true)
	{
		if (!$inner)
		{
			return $node->ownerDocument->saveHTML($node);
		}

		// Inner HTML is the serialization of every child, in order
		$html = '';

		foreach ($node->childNodes as $child)
		{
			$html .= $node->ownerDocument->saveHTML($child);
		}

		return $html;
	}
}