primo commit
This commit is contained in:
158
administrator/components/com_finder/src/Indexer/Parser/Html.php
Normal file
158
administrator/components/com_finder/src/Indexer/Parser/Html.php
Normal file
@ -0,0 +1,158 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* @package Joomla.Administrator
|
||||
* @subpackage com_finder
|
||||
*
|
||||
* @copyright (C) 2011 Open Source Matters, Inc. <https://www.joomla.org>
|
||||
* @license GNU General Public License version 2 or later; see LICENSE.txt
|
||||
*/
|
||||
|
||||
namespace Joomla\Component\Finder\Administrator\Indexer\Parser;
|
||||
|
||||
use Joomla\Component\Finder\Administrator\Indexer\Parser;
|
||||
|
||||
// phpcs:disable PSR1.Files.SideEffects
|
||||
\defined('_JEXEC') or die;
|
||||
// phpcs:enable PSR1.Files.SideEffects
|
||||
|
||||
/**
|
||||
* HTML Parser class for the Finder indexer package.
|
||||
*
|
||||
* @since 2.5
|
||||
*/
|
||||
class Html extends Parser
{
    /**
     * Method to parse input and extract the plain text. Because this method is
     * called from both inside and outside the indexer, it needs to be able to
     * batch out its parsing functionality to deal with the inefficiencies of
     * regular expressions. The HTML-specific clean-up is done here; the result
     * is then handed to the parent parser.
     *
     * @param   string  $input  The input to parse.
     *
     * @return  string  The plain text input.
     *
     * @since   2.5
     */
    public function parse($input)
    {
        // Strip invalid UTF-8 characters. The substitute character is switched
        // to "none" only for the duration of the conversion and restored after.
        $oldSetting = \ini_get('mbstring.substitute_character');
        ini_set('mbstring.substitute_character', 'none');
        $input = mb_convert_encoding($input, 'UTF-8', 'UTF-8');
        ini_set('mbstring.substitute_character', $oldSetting);

        // Remove anything between <head> and </head> tags. Do this first
        // because there might be <script> or <style> tags nested inside.
        $input = $this->removeBlocks($input, '<head>', '</head>');

        // Convert <style> and <noscript> tags to <script> tags so that all three
        // kinds of block can be removed in a single pass.
        $search = [
            '<style', '</style',
            '<noscript', '</noscript',
        ];
        $replace = [
            '<script', '</script',
            '<script', '</script',
        ];

        // Tag names are case-insensitive in HTML and removeBlocks() matches
        // case-insensitively, so the conversion must be case-insensitive too.
        // Otherwise the contents of e.g. <STYLE>...</STYLE> would survive.
        $input = str_ireplace($search, $replace, $input);

        // Strip all script blocks.
        $input = $this->removeBlocks($input, '<script', '</script>');

        // Decode HTML entities.
        $input = html_entity_decode($input, ENT_QUOTES, 'UTF-8');

        // Convert non-breaking spaces to regular spaces. After the decode above,
        // &nbsp; / &#160; are the character U+00A0, which is C2 A0 in UTF-8.
        $input = str_replace("\xC2\xA0", ' ', $input);

        // Add a space before both the OPEN and CLOSE tags of BLOCK and LINE BREAKING elements,
        // e.g. 'all<h1><em>m</em>obile List</h1>' will become 'all mobile List'
        $input = preg_replace('/(<|<\/)(' .
            'address|article|aside|blockquote|br|canvas|dd|div|dl|dt|' .
            'fieldset|figcaption|figure|footer|form|h1|h2|h3|h4|h5|h6|header|hgroup|hr|li|' .
            'main|nav|noscript|ol|output|p|pre|section|table|tfoot|ul|video' .
            ')\b/i', ' $1$2', $input);

        // Strip HTML tags.
        $input = strip_tags($input);

        return parent::parse($input);
    }

    /**
     * Method to process HTML input and extract the plain text.
     *
     * Collapses every run of white space into a single space.
     *
     * @param   string  $input  The input to process.
     *
     * @return  string  The plain text input.
     *
     * @since   2.5
     */
    protected function process($input)
    {
        // Replace any amount of white space with a single space.
        $output = preg_replace('#\s+#u', ' ', $input);

        if ($output !== null) {
            return $output;
        }

        // preg_replace() returns null on error, which with the /u modifier
        // includes a subject that is not valid UTF-8. Fall back to collapsing
        // ASCII white space only; that never touches bytes >= 0x80, so any
        // multi-byte sequences in the input are left exactly as they were.
        $output = preg_replace('#[ \t\n\r\f\x0B]+#', ' ', $input);

        return $output ?? (string) $input;
    }

    /**
     * Method to remove blocks of text between a start and an end tag.
     * Each block removed is effectively replaced by a single space.
     *
     * Note: The start tag and the end tag must be different.
     * Note: Blocks must not be nested.
     * Note: Tags are matched case-insensitively. Positions are byte offsets used
     *       consistently for searching and slicing, so multi-byte strings are
     *       handled correctly.
     * Note: A start tag with no matching end tag ends the search; everything
     *       from that start tag onwards is kept.
     *
     * @param   string  $input     String to be processed.
     * @param   string  $startTag  String representing the start tag.
     * @param   string  $endTag    String representing the end tag.
     *
     * @return  string  The input with blocks removed.
     *
     * @since   3.4
     */
    private function removeBlocks($input, $startTag, $endTag)
    {
        // Find the first start tag.
        $start = stripos($input, $startTag);

        // If no start tags were found, return the string unchanged.
        if ($start === false) {
            return $input;
        }

        $return         = '';
        $offset         = 0;
        $startTagLength = \strlen($startTag);
        $endTagLength   = \strlen($endTag);

        // Look for all blocks defined by the start and end tags.
        while ($start !== false) {
            // Accumulate the substring up to the start tag, followed by the
            // single space that stands in for the removed block.
            $return .= substr($input, $offset, $start - $offset) . ' ';

            // Look for an end tag corresponding to the start tag.
            $end = stripos($input, $endTag, $start + $startTagLength);

            // If no corresponding end tag, keep the rest of the string as is.
            if ($end === false) {
                // Rewind the offset to the start tag so that the unterminated
                // block is appended below rather than dropped or duplicated.
                $offset = $start;
                break;
            }

            // Skip past the end tag.
            $offset = $end + $endTagLength;

            // Look for the next start tag and loop.
            $start = stripos($input, $startTag, $offset);
        }

        // Add in the final substring after the last end tag.
        $return .= substr($input, $offset);

        return $return;
    }
}
|
||||
@ -0,0 +1,47 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* @package Joomla.Administrator
|
||||
* @subpackage com_finder
|
||||
*
|
||||
* @copyright (C) 2011 Open Source Matters, Inc. <https://www.joomla.org>
|
||||
* @license GNU General Public License version 2 or later; see LICENSE.txt
|
||||
*/
|
||||
|
||||
namespace Joomla\Component\Finder\Administrator\Indexer\Parser;
|
||||
|
||||
use Joomla\Component\Finder\Administrator\Indexer\Parser;
|
||||
|
||||
// phpcs:disable PSR1.Files.SideEffects
|
||||
\defined('_JEXEC') or die;
|
||||
// phpcs:enable PSR1.Files.SideEffects
|
||||
|
||||
/**
|
||||
* RTF Parser class for the Finder indexer package.
|
||||
*
|
||||
* @since 2.5
|
||||
*/
|
||||
class Rtf extends Parser
{
    /**
     * Reduce RTF markup to plain text.
     *
     * Picture groups are dropped, braces become spaces, escaped line breaks
     * become plain line breaks, and backslash-introduced sequences are
     * replaced by spaces.
     *
     * @param   string  $input  The input to process.
     *
     * @return  string  The plain text input.
     *
     * @since   2.5
     */
    protected function process($input)
    {
        // Drop embedded pictures: everything from "{\pict" up to the next "}".
        $text = preg_replace('#{\\\pict[^}]*}#mi', '', $input);

        // Turn group delimiters into spaces and a backslash-escaped newline
        // into a plain newline.
        $groupMarkers = ['{', '}', "\\\n"];
        $plainMarkers = [' ', ' ', "\n"];
        $text         = str_replace($groupMarkers, $plainMarkers, $text);

        // Blank out the remaining backslash sequences, in this order: first a
        // backslash up to and including the next semicolon, then a backslash
        // followed by letters, digits or apostrophes.
        $controlPatterns = [
            '#\\\([^;]+?);#m',
            '#\\\[\'a-zA-Z0-9]+#mi',
        ];

        foreach ($controlPatterns as $controlPattern) {
            $text = preg_replace($controlPattern, ' ', $text);
        }

        return $text;
    }
}
|
||||
@ -0,0 +1,39 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* @package Joomla.Administrator
|
||||
* @subpackage com_finder
|
||||
*
|
||||
* @copyright (C) 2011 Open Source Matters, Inc. <https://www.joomla.org>
|
||||
* @license GNU General Public License version 2 or later; see LICENSE.txt
|
||||
*/
|
||||
|
||||
namespace Joomla\Component\Finder\Administrator\Indexer\Parser;
|
||||
|
||||
use Joomla\Component\Finder\Administrator\Indexer\Parser;
|
||||
|
||||
// phpcs:disable PSR1.Files.SideEffects
|
||||
\defined('_JEXEC') or die;
|
||||
// phpcs:enable PSR1.Files.SideEffects
|
||||
|
||||
/**
|
||||
* Text Parser class for the Finder indexer package.
|
||||
*
|
||||
* @since 2.5
|
||||
*/
|
||||
class Txt extends Parser
{
    /**
     * Process plain text input.
     *
     * Plain text needs no conversion, so the input is handed back untouched.
     *
     * @param   string  $input  The input to process.
     *
     * @return  string  The same text that was passed in.
     *
     * @since   2.5
     */
    protected function process($input)
    {
        // Nothing to strip or decode for plain text.
        return $input;
    }
}
|
||||
Reference in New Issue
Block a user