Files
conservatorio-tomadini/plugins/system/nrframework/NRFramework/DOMCrawler.php
2024-12-31 11:07:09 +01:00

368 lines
7.9 KiB
PHP

<?php
/**
* @author Tassos Marinos <info@tassos.gr>
* @link https://www.tassos.gr
* @copyright Copyright © 2024 Tassos All Rights Reserved
* @license GNU GPLv3 <http://www.gnu.org/licenses/gpl.html> or later
*/
namespace NRFramework;
use Joomla\String\StringHelper;
use NRFramework\Cache;
defined('_JEXEC') or die;
class DOMCrawler
{
    /**
     * The content to crawl, as normalized by stringToUTF8().
     *
     * @var string
     */
    protected $content;

    /**
     * The nodes discovered by the most recent filter() call.
     *
     * Null until a query has run, false when DOMXPath rejected the expression.
     *
     * @var \DOMNodeList|false|null
     */
    public $nodes;

    /**
     * Class constructor
     *
     * @param   mixed  $content  The content to crawl. When null, the value returned by
     *                           \NRFramework\Functions::getBuffer() is used instead.
     */
    public function __construct($content = null)
    {
        if (is_null($content))
        {
            $content = \NRFramework\Functions::getBuffer();
        }

        $this->setContent($content);
    }

    /**
     * Set the content to crawl
     *
     * @param   string  $content  The HTML content to crawl
     *
     * @return  void
     */
    public function setContent($content)
    {
        $this->content = $this->stringToUTF8($content);
    }

    /**
     * Filter DOM elements with a CSS Selector or an XPath expression
     *
     * Expressions starting with "=" are treated as raw XPath, for example:
     * =//h1[contains(@class, "faq-question")]
     * Anything else is converted to XPath by cssSelectorToXPath().
     *
     * @param   string  $expression  A CSS Selector or XPath expression
     *
     * @return  $this   The crawler itself, so result readers can be chained
     */
    public function filter($expression)
    {
        // Start from a clean slate, otherwise an early return below would keep
        // exposing the nodes matched by a previous filter() call.
        $this->nodes = null;

        // Nothing to search in, or nothing to search for
        if (empty($this->content) || empty($expression))
        {
            return $this;
        }

        if (!class_exists('DOMDocument') || !class_exists('DOMXPath'))
        {
            return $this;
        }

        // The cache key must depend on the content too: the same selector run
        // against a different document must never return the other document's nodes.
        $hash = md5(__CLASS__ . '|' . md5($this->content) . '|' . $expression);

        if (Cache::has($hash))
        {
            $this->nodes = Cache::get($hash, false);

            return $this;
        }

        if (substr($expression, 0, 1) === '=')
        {
            // Raw XPath: drop the leading "=" marker
            $xpath = (string) substr($expression, 1);
        }
        else
        {
            // Create the XPath from the provided CSS selector
            $xpath = $this->cssSelectorToXPath($expression);
        }

        // An empty query (e.g. "=" or a whitespace-only selector) cannot match anything
        if ($xpath === '')
        {
            return $this;
        }

        // Silence libxml parsing errors while we work, then hand the error
        // handling state back exactly as we found it.
        $previousErrorState = libxml_use_internal_errors(true);

        $dom = new \DOMDocument;
        $dom->loadHTML($this->content);

        $finder      = new \DOMXPath($dom);
        $this->nodes = $finder->query($xpath);

        // Drop the collected errors so they do not pile up in memory
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorState);

        // Speed up filtering by caching results
        Cache::set($hash, $this->nodes);

        return $this;
    }

    /**
     * Returns the HTML of the discovered node(s)
     *
     * @param   string   $fallback   The value to return if no node is found
     * @param   boolean  $inner      If set to true, only the node's inner HTML will be returned.
     * @param   boolean  $firstOnly  If set to true, only the first node will be returned.
     *
     * @return  string|array  A string when $firstOnly is true, otherwise a list of strings.
     *                        $fallback is returned as-is when there are no nodes.
     */
    public function html($fallback = '', $inner = false, $firstOnly = true)
    {
        return $this->collectFromNodes(
            function ($node) use ($inner)
            {
                return $this->getNodeHTML($node, $inner);
            },
            $fallback,
            $firstOnly
        );
    }

    /**
     * Returns the text content of the discovered node(s)
     *
     * @param   string   $fallback   The value to return if no node is found
     * @param   boolean  $firstOnly  If set to true, only the first node will be returned.
     *
     * @return  string|array  A string when $firstOnly is true, otherwise a list of strings.
     *                        $fallback is returned as-is when there are no nodes.
     */
    public function text($fallback = '', $firstOnly = true)
    {
        return $this->collectFromNodes(
            function ($node)
            {
                return $node->textContent;
            },
            $fallback,
            $firstOnly
        );
    }

    /**
     * Returns the attribute value of the discovered node(s)
     *
     * @param   string   $attribute_name  The name of the attribute
     * @param   string   $fallback        The value to return if no nodes found
     * @param   boolean  $firstOnly       If set to true, only the first node will be returned.
     *
     * @return  string|array  A string when $firstOnly is true, otherwise a list of strings.
     *                        $fallback is returned as-is when there are no nodes.
     */
    public function attr($attribute_name, $fallback = '', $firstOnly = true)
    {
        return $this->collectFromNodes(
            function ($node) use ($attribute_name)
            {
                // A raw XPath query may select text or attribute nodes, which have no
                // attributes. Treat them like an element missing the attribute.
                return $node instanceof \DOMElement ? $node->getAttribute($attribute_name) : '';
            },
            $fallback,
            $firstOnly
        );
    }

    /**
     * Returns the total number of nodes found
     *
     * @param   integer  $fallback  The value to return if no nodes found
     *
     * @return  integer
     */
    public function count($fallback = 0)
    {
        if (!$this->nodes || !$this->nodes->length)
        {
            return $fallback;
        }

        return $this->nodes->length;
    }

    /**
     * Helper method to crawl the content based on the value of a CSS Selector field.
     *
     * @param   array    $props      Expected properties: selector, task, attr.
     *                               task is one of html, innerhtml, attr, count;
     *                               any other (or missing) task reads the text.
     * @param   boolean  $firstOnly  If set to true, only the first node is read.
     *
     * @return  mixed    String, list of strings or, for the count task, an integer.
     *                   An empty selector yields '' ($firstOnly) or [] (otherwise).
     */
    public function readCSSSelectorField($props, $firstOnly = true)
    {
        $props    = (array) $props;
        $fallback = $firstOnly ? '' : [];

        if (empty($props['selector']))
        {
            return $fallback;
        }

        // Both keys are optional; do not trip over "undefined index" when they are missing
        $task      = isset($props['task']) ? $props['task'] : '';
        $attribute = isset($props['attr']) ? $props['attr'] : '';

        $this->filter($props['selector']);

        switch ($task)
        {
            case 'html':
                return $this->html($fallback, false, $firstOnly);

            case 'innerhtml':
                return $this->html($fallback, true, $firstOnly);

            case 'attr':
                return $this->attr($attribute, $fallback, $firstOnly);

            case 'count':
                return $this->count();

            default:
                return $this->text($fallback, $firstOnly);
        }
    }

    /**
     * Shared implementation of html(), text() and attr(): applies an extractor
     * to the first node or to every node and cleans each extracted value.
     *
     * @param   callable  $extractor  Receives a node, returns the raw string to clean
     * @param   mixed     $fallback   The value to return if there are no nodes
     * @param   boolean   $firstOnly  If set to true, only the first node is read
     *
     * @return  mixed
     */
    private function collectFromNodes($extractor, $fallback, $firstOnly)
    {
        if (!$this->nodes || !$this->nodes->length)
        {
            return $fallback;
        }

        if ($firstOnly)
        {
            return $this->cleanText($extractor($this->nodes->item(0)));
        }

        $result = [];

        foreach ($this->nodes as $node)
        {
            $result[] = $this->cleanText($extractor($node));
        }

        return $result;
    }

    /**
     * Helper method to clean the text
     *
     * @param   string  $text  The text to clean
     *
     * @return  string  The text as returned by StringHelper::trim()
     */
    private function cleanText($text)
    {
        return StringHelper::trim($text);
    }

    /**
     * Transforms a CSS Selector to an XPath expression
     *
     * Supports whitespace separated (descendant) compound selectors made of an
     * optional tag name, IDs and classes, e.g. "div.box #title" or "ul li.item.active".
     * Combinators (>, +, ~), attribute selectors and pseudo-classes are not translated.
     *
     * @param   string  $selector  The CSS selector to transform
     *
     * @return  string  XPath expression, or an empty string for a blank selector
     */
    private function cssSelectorToXPath($selector)
    {
        // Split on any run of whitespace so "div   p" does not produce empty steps
        $tokens = preg_split('/\s+/', trim((string) $selector), -1, PREG_SPLIT_NO_EMPTY);
        $xpath  = '';

        foreach ($tokens as $token)
        {
            $xpath .= $this->compoundSelectorToXPath($token);
        }

        return $xpath;
    }

    /**
     * Transforms a single compound selector (tag#id.class1.class2) to an XPath step
     *
     * @param   string  $token  A selector without whitespace
     *
     * @return  string  A descendant step such as //div[@id="a" and contains(...)]
     */
    private function compoundSelectorToXPath($token)
    {
        // Everything before the first "." or "#" is the tag name
        $tagLength = strcspn($token, '.#');
        $tag       = $tagLength > 0 ? substr($token, 0, $tagLength) : '*';

        // No class or ID given
        if (!preg_match_all('/([.#])([^.#]+)/', $token, $parts, PREG_SET_ORDER))
        {
            return '//' . $tag;
        }

        $conditions = [];

        foreach ($parts as $part)
        {
            if ($part[1] === '#')
            {
                $conditions[] = '@id="' . $part[2] . '"';

                continue;
            }

            // Pad with spaces so "btn" does not match "btn-primary"
            $conditions[] = 'contains(concat(" ", normalize-space(@class), " "), " ' . $part[2] . ' ")';
        }

        // Always "//": a ".//" step glued after a previous step would be read
        // by XPath as part of the previous element name.
        return '//' . $tag . '[' . implode(' and ', $conditions) . ']';
    }

    /**
     * Prepare a string for DOMDocument::loadHTML(): make sure it is valid UTF-8
     * and encode every non-ASCII character as a numeric HTML entity, so that
     * non-latin languages are parsed correctly.
     *
     * @param   mixed  $string  The string to convert
     *
     * @return  string  The converted string, or an empty string for non-string input
     */
    private function stringToUTF8($string)
    {
        if (!is_scalar($string) && !(is_object($string) && method_exists($string, '__toString')))
        {
            return '';
        }

        $string = (string) $string;

        if ($string === '')
        {
            return '';
        }

        // Replaces invalid byte sequences instead of giving up on the whole string,
        // which is what iconv() does when it meets a single malformed byte.
        $string = mb_convert_encoding($string, 'UTF-8', 'UTF-8');

        return mb_encode_numericentity($string, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
    }

    /**
     * Helper method to return the outer or inner HTML of a node
     *
     * @param   \DOMNode  $node   The node object
     * @param   boolean   $inner  Whether to return the inner (true) or outer (false) HTML
     *
     * @return  string    The HTML of the node
     */
    private function getNodeHTML($node, $inner = true)
    {
        // The document node has no ownerDocument; it is its own document
        $document = $node instanceof \DOMDocument ? $node : $node->ownerDocument;

        if (!$inner)
        {
            return $document->saveHTML($node);
        }

        $html = '';

        foreach ($node->childNodes as $child)
        {
            $html .= $document->saveHTML($child);
        }

        return $html;
    }
}