first commit
This commit is contained in:
21
libraries/vendor/wamania/php-stemmer/LICENSE
vendored
Normal file
21
libraries/vendor/wamania/php-stemmer/LICENSE
vendored
Normal file
@ -0,0 +1,21 @@
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2016 wamania
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
8
libraries/vendor/wamania/php-stemmer/src/NotFoundException.php
vendored
Normal file
8
libraries/vendor/wamania/php-stemmer/src/NotFoundException.php
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
<?php
|
||||
|
||||
namespace Wamania\Snowball;
|
||||
|
||||
/**
 * Dedicated exception class of the Snowball stemmer package.
 *
 * It adds no behaviour on top of \Exception; it only gives callers a
 * distinct type to catch.
 */
class NotFoundException extends \Exception
{
}
|
||||
304
libraries/vendor/wamania/php-stemmer/src/Stemmer/Catalan.php
vendored
Normal file
304
libraries/vendor/wamania/php-stemmer/src/Stemmer/Catalan.php
vendored
Normal file
@ -0,0 +1,304 @@
|
||||
<?php
|
||||
|
||||
namespace Wamania\Snowball\Stemmer;
|
||||
|
||||
use voku\helper\UTF8;
|
||||
|
||||
/**
|
||||
*
|
||||
* @link http://snowball.tartarus.org/algorithms/catalan/stemmer.html
|
||||
* @author Orestes Sanchez Benavente <orestes@estotienearreglo.es>
|
||||
*
|
||||
*
|
||||
* Some fine tuning was necessary in this implementation of the original catalan stemmer algorithm in Snowball:
|
||||
*
|
||||
* 1. Some suffix sets have overlapping items, so here all items are sorted by decreasing size, to
|
||||
* prevent that a shorter suffix will skip a larger one.
|
||||
*
|
||||
* 2. Some alternatives (`or` operator in Snowball) in the original algorithm have
|
||||
* been rearranged to make sure they are applied in the right order.
|
||||
*
|
||||
* Based on the reference Snowball implementation by Israel Olalla of iSOCO
|
||||
*/
|
||||
class Catalan extends Stem
|
||||
{
|
||||
|
||||
/**
|
||||
* All catalan vowels
|
||||
*/
|
||||
protected static $vowels = ['a', 'e', 'i', 'o', 'u', 'á', 'é', 'í', 'ó', 'ú', 'à', 'è', 'ï', 'ò', 'ü'];
|
||||
|
||||
protected static $standard_suffix_1a = [
|
||||
'allengües', 'ativitats', 'bilitats', 'ionistes', 'ialistes', 'ialismes', 'ativitat', 'atòries', 'isament',
|
||||
'bilitat', 'ivitats', 'ionisme', 'ionista', 'ialista', 'ialisme', 'íssimes', 'formes', 'ivisme', 'aments',
|
||||
'nça', 'ificar', 'idores', 'ancies', 'atòria', 'ivitat', 'encies', 'ències', 'atives', 'íssima', 'íssims',
|
||||
'ictes', 'eries', 'itats', 'itzar', 'ament', 'ments', 'sfera', 'ícies', 'àries', 'cions', 'ístic', 'issos',
|
||||
'íssem', 'íssiu', 'issem', 'isseu', 'ísseu', 'dores', 'adura', 'ívola', 'ables', 'adors', 'idors', 'adora',
|
||||
'doras', 'dures', 'ancia', 'toris', 'encia', 'ència', 'ïtats', 'atius', 'ativa', 'ibles', 'asses', 'assos',
|
||||
'íssim', 'ìssem', 'ìsseu', 'ìssin', 'ismes', 'istes', 'inies', 'íinia', 'ínies', 'trius', 'atge', 'icte',
|
||||
'ells', 'ella', 'essa', 'eres', 'ines', 'able', 'itat', 'ives', 'ment', 'amen', 'iste', 'aire', 'eria',
|
||||
'eses', 'esos', 'ícia', 'icis', 'ícis', 'ària', 'alla', 'nces', 'enca', 'issa', 'dora', 'dors', 'bles',
|
||||
'ívol', 'egar', 'ejar', 'itar', 'ació', 'ants', 'tori', 'ions', 'isam', 'ores', 'aris', 'ïtat', 'atiu',
|
||||
'ible', 'assa', 'ents', 'imes', 'isme', 'ista', 'inia', 'ites', 'triu', 'oses', 'osos', 'ient', 'otes',
|
||||
'ell', 'esc', 'ets', 'eta', 'ers', 'ina', 'iva', 'ius', 'fer', 'als', 'era', 'ana', 'esa', 'ici', 'íci',
|
||||
'ció', 'dor', 'all', 'enc', 'osa', 'ble', 'dís', 'dur', 'ant', 'ats', 'ota', 'ors', 'ora', 'ari', 'uts',
|
||||
'uds', 'ent', 'ims', 'ima', 'ita', 'ar', 'és', 'ès', 'et', 'ls', 'ió', 'ot', 'al', 'or', 'il', 'ís', 'ós',
|
||||
'ud', 'ots', 'ó'
|
||||
];
|
||||
|
||||
protected static $attached_pronoun = [
|
||||
'selas', 'selos', '\'hi', '\'ho', '\'ls', '-les', '-nos', '\'ns', 'sela', 'selo', '\'s', '\'l', '-ls', '-la',
|
||||
'-li', 'vos', 'nos', '-us', '\'n', '-ns', '\'m', '-me', '-te', '\'t', 'los', 'las', 'les', 'ens', 'se', 'us',
|
||||
'-n', '-m', 'li', 'lo', 'me', 'le', 'la', 'ho', 'hi'
|
||||
];
|
||||
|
||||
protected static $verb_suffixes = [
|
||||
'aríamos', 'eríamos', 'iríamos', 'eresseu', 'iéramos', 'iésemos', 'adores', 'aríais', 'aremos', 'eríais',
|
||||
'eremos', 'iríais', 'iremos', 'ierais', 'ieseis', 'asteis', 'isteis', 'ábamos', 'áramos', 'ásemos', 'isquen',
|
||||
'esquin', 'esquis', 'esques', 'esquen', 'ïsquen', 'ïsques', 'adora', 'adors', 'arían', 'arías', 'arian',
|
||||
'arien', 'aries', 'aréis', 'erían', 'erías', 'eréis', 'erass', 'irían', 'irías', 'iréis', 'asseu', 'esseu',
|
||||
'àsseu', 'àssem', 'àssim', 'àssiu', 'essen', 'esses', 'assen', 'asses', 'assim', 'assiu', 'éssen', 'ésseu',
|
||||
'éssim', 'éssiu', 'éssem', 'aríem', 'aríeu', 'eixer', 'eixes', 'ieran', 'iesen', 'ieron', 'iendo', 'essin',
|
||||
'essis', 'assin', 'assis', 'essim', 'èssim', 'èssiu', 'ieras', 'ieses', 'abais', 'arais', 'aseis', 'íamos',
|
||||
'irien', 'iries', 'irìem', 'irìeu', 'iguem', 'igueu', 'esqui', 'eixin', 'eixis', 'eixen', 'iríem', 'iríeu',
|
||||
'atges', 'issen', 'isses', 'issin', 'issis', 'issiu', 'issim', 'ïssin', 'íssiu', 'íssim', 'ïssis', 'ïguem',
|
||||
'ïgueu', 'ïssen', 'ïsses', 'itzeu', 'itzis', 'ador', 'ents', 'udes', 'eren', 'arán', 'arás', 'aria', 'aràs',
|
||||
'aría', 'arés', 'erán', 'erás', 'ería', 'erau', 'irán', 'irás', 'iría', 'írem', 'íreu', 'aves', 'avem', 'ávem',
|
||||
'àvem', 'àveu', 'áveu', 'aven', 'ares', 'àrem', 'àreu', 'àren', 'areu', 'aren', 'tzar', 'ides', 'ïdes', 'ades',
|
||||
'iera', 'iese', 'aste', 'iste', 'aban', 'aran', 'asen', 'aron', 'abas', 'adas', 'idas', 'aras', 'ases', 'íais',
|
||||
'ados', 'idos', 'amos', 'imos', 'ques', 'iran', 'irem', 'iren', 'ires', 'ireu', 'iria', 'iràs', 'eixi', 'eixo',
|
||||
'isin', 'isis', 'esca', 'isca', 'ïsca', 'ïren', 'ïres', 'ïxen', 'ïxes', 'ixen', 'ixes', 'inin', 'inis', 'ineu',
|
||||
'itza', 'itzi', 'itzo', 'itzà', 'arem', 'ent', 'arà', 'ará', 'ara', 'aré', 'erá', 'eré', 'irá', 'iré', 'íeu',
|
||||
'ies', 'íem', 'ìeu', 'ien', 'uda', 'ava', 'ats', 'ant', 'ïen', 'ams', 'ïes', 'dre', 'eix', 'ïda', 'aba', 'ada',
|
||||
'ida', 'its', 'ids', 'ase', 'ían', 'ado', 'ido', 'ieu', 'ess', 'ass', 'ías', 'áis', 'ira', 'irà', 'irè', 'sis',
|
||||
'sin', 'int', 'isc', 'ïsc', 'ïra', 'ïxo', 'ixo', 'ixa', 'ini', 'itz', 'iïn', 're', 'ie', 'er', 'ia', 'at', 'ut',
|
||||
'au', 'ïm', 'ïu', 'és', 'en', 'es', 'em', 'am', 'ïa', 'it', 'ït', 'ía', 'ad', 'ed', 'id', 'an', 'ió', 'ar',
|
||||
'ir', 'as', 'ii', 'io', 'ià', 'ís', 'ïx', 'ix', 'in', 'às', 'iï', 'iïs', 'í'
|
||||
];
|
||||
|
||||
protected static $residual_suffixes = [
|
||||
'itz', 'it', 'os', 'eu', 'iu', 'is', 'ir', 'ïn', 'ïs', 'a', 'o', 'á', 'à', 'í', 'ó', 'e', 'é', 'i', 's', 'ì',
|
||||
'ï'
|
||||
];
|
||||
|
||||
/**
 * {@inheritdoc}
 *
 * Lower-cases the word, computes the R1 and R2 regions and then runs
 * step 0 (attached pronoun), step 1a (standard suffix), step 1b (verb
 * suffix, only when step 1a left the word unchanged), step 2 (residual
 * suffix) and the final accent clean-up.
 *
 * @param string $word word to stem; must be valid UTF-8
 * @return string the stemmed word
 * @throws \Exception when $word is not valid UTF-8
 */
public function stem($word)
{
    // Everything below works on UTF-8 text only.
    if (!UTF8::is_utf8($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word = UTF8::strtolower($word);

    // The Catalan stemmer only needs R1 and R2 (no RV region).
    $this->r1();
    $this->r2();

    // Step 0: attached pronoun.
    $this->step0();

    // Snapshot taken after step 0 so we can tell whether step 1a altered the word.
    $beforeStandardSuffix = $this->word;

    // Step 1a: standard suffix.
    $this->step1a();

    // Step 1b: verb suffix, only when step 1a removed nothing.
    $unchangedByStep1a = ($this->word == $beforeStandardSuffix);
    if ($unchangedByStep1a) {
        $this->step1b();
    }

    // Step 2 (residual suffix) and the final normalisation always run.
    $this->step2();
    $this->finish();

    return $this->word;
}
|
||||
|
||||
/**
 * Step 0: attached pronoun.
 *
 * Looks for one of the attached-pronoun suffixes and deletes it when the
 * match lies in R1.
 *
 * @return bool true when a pronoun was removed, false otherwise
 */
private function step0()
{
    $position = $this->search(static::$attached_pronoun);

    // No pronoun at the end of the word, or the match is outside R1.
    if ($position === false || !$this->inR1($position)) {
        return false;
    }

    $this->word = UTF8::substr($this->word, 0, $position);

    return true;
}
|
||||
|
||||
/**
 * Step 1a: standard suffix.
 *
 * The sub-steps are tried in the order given by the rule table below.
 * The first rule whose suffix list matches the end of the word ends the
 * step: its action is applied only when the match lies in the required
 * region, but the method returns true either way.
 *
 * @return bool true when any rule matched (even if nothing was changed), false otherwise
 */
private function step1a()
{
    // Each rule: suffix list, region test to use, and either a regex
    // replacement (pattern + replacement) or plain deletion (pattern null).
    $rules = [
        // 1a.2 runs before 1a.1 because they overlap on `cions` (1a.1) and `acions` (1a.2).
        // acions ada ades: delete if in R2.
        [
            'suffixes'    => ['acions', 'ada', 'ades'],
            'region'      => 'R2',
            'pattern'     => null,
            'replacement' => null,
        ],
        // 1a.1: every suffix of self::$standard_suffix_1a: delete if in R1.
        [
            'suffixes'    => self::$standard_suffix_1a,
            'region'      => 'R1',
            'pattern'     => null,
            'replacement' => null,
        ],
        // 1a.3: logía logíes logia logies logi logis lógica lógics lógiques:
        // replace with log if in R2.
        [
            'suffixes'    => ['logía', 'logíes', 'logia', 'logies', 'logis', 'lógica', 'lógics', 'lógiques', 'logi'],
            'region'      => 'R2',
            'pattern'     => '#(logía|logíes|logia|logies|logis|lógica|lógics|lógiques|logi)$#u',
            'replacement' => 'log',
        ],
        // 1a.4: ic ica ics iques: replace with ic if in R2.
        [
            'suffixes'    => ['ics', 'ica', 'iques', 'ic'],
            'region'      => 'R2',
            'pattern'     => '#(ics|ica|iques|ic)$#u',
            'replacement' => 'ic',
        ],
        // 1a.5: quíssims quíssimes quíssima quíssim: replace with c if in R1.
        [
            'suffixes'    => ['quíssima', 'quíssims', 'quíssimes', 'quíssim'],
            'region'      => 'R1',
            'pattern'     => '#(quíssima|quíssims|quíssimes|quíssim)$#u',
            'replacement' => 'c',
        ],
    ];

    foreach ($rules as $rule) {
        $position = $this->search($rule['suffixes']);
        if ($position === false) {
            continue;
        }

        $inRegion = ($rule['region'] === 'R2')
            ? $this->inR2($position)
            : $this->inR1($position);

        if ($inRegion) {
            if ($rule['pattern'] === null) {
                $this->word = UTF8::substr($this->word, 0, $position);
            } else {
                $this->word = preg_replace($rule['pattern'], $rule['replacement'], $this->word);
            }
        }

        // A matching suffix ends step 1a whether or not it was in the region.
        return true;
    }

    return false;
}
|
||||
|
||||
/**
 * Step 1b: verb suffixes.
 *
 * First tries the verb suffixes of static::$verb_suffixes (deleted when
 * in R1), then the suffix `ando` (deleted when in R2). The first list
 * that matches the end of the word ends the step.
 *
 * @return bool true when one of the suffixes matched (even if nothing was deleted), false otherwise
 */
private function step1b()
{
    // Step 1b.1: any suffix of static::$verb_suffixes: delete if in R1.
    $position = $this->search(static::$verb_suffixes);
    if ($position !== false) {
        if ($this->inR1($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        return true;
    }

    // Step 1b.2: ando: delete if in R2.
    $position = $this->search(['ando']);
    if ($position === false) {
        return false;
    }

    if ($this->inR2($position)) {
        $this->word = UTF8::substr($this->word, 0, $position);
    }

    return true;
}
|
||||
|
||||
/**
 * Step 2: residual suffix.
 *
 * First tries the suffixes of static::$residual_suffixes (deleted when
 * in R1), then `iqu` (replaced with `ic` when in R1). The first list
 * that matches the end of the word ends the step.
 *
 * @return bool true when one of the suffixes matched (even if nothing was changed), false otherwise
 */
private function step2()
{
    // Step 2.1: residual suffix: delete if in R1.
    $position = $this->search(static::$residual_suffixes);
    if ($position !== false) {
        if ($this->inR1($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        return true;
    }

    // Step 2.2: iqu: replace with ic if in R1.
    $position = $this->search(['iqu']);
    if ($position === false) {
        return false;
    }

    if ($this->inR1($position)) {
        $this->word = preg_replace('#(iqu)$#u', 'ic', $this->word);
    }

    return true;
}
|
||||
|
||||
/**
 * Final normalisation.
 *
 * Replaces accented vowels with their plain counterparts and turns the
 * middle dot of the "ela geminada" (l·l) into a full stop.
 */
private function finish()
{
    // Character => replacement, in the order the replacements are applied.
    $replacements = [
        'á' => 'a',
        'é' => 'e',
        'í' => 'i',
        'ó' => 'o',
        'ú' => 'u',
        'à' => 'a',
        'è' => 'e',
        'ì' => 'i',
        'ò' => 'o',
        'ï' => 'i',
        'ü' => 'u',
        '·' => '.',
    ];

    $this->word = UTF8::str_replace(
        array_keys($replacements),
        array_values($replacements),
        $this->word
    );
}
|
||||
|
||||
}
|
||||
152
libraries/vendor/wamania/php-stemmer/src/Stemmer/Danish.php
vendored
Normal file
152
libraries/vendor/wamania/php-stemmer/src/Stemmer/Danish.php
vendored
Normal file
@ -0,0 +1,152 @@
|
||||
<?php
|
||||
|
||||
namespace Wamania\Snowball\Stemmer;
|
||||
|
||||
use voku\helper\UTF8;
|
||||
|
||||
/**
|
||||
*
|
||||
* @link http://snowball.tartarus.org/algorithms/danish/stemmer.html
|
||||
* @author wamania
|
||||
*
|
||||
*/
|
||||
class Danish extends Stem
|
||||
{
|
||||
/**
|
||||
* All danish vowels
|
||||
*/
|
||||
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y', 'æ', 'å', 'ø');
|
||||
|
||||
/**
 * {@inheritdoc}
 *
 * Lower-cases the word, computes R1 (moved forward so that at least
 * three letters precede it) and runs steps 1 to 4 in order.
 *
 * @param string $word word to stem; must be valid UTF-8
 * @return string the stemmed word
 * @throws \Exception when $word is not valid UTF-8
 */
public function stem($word): string
{
    // Everything below works on UTF-8 text only.
    if (!UTF8::is_utf8($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word = UTF8::strtolower($word);

    // R2 is not used by the Danish stemmer; only R1 is computed.
    $this->r1();

    // R1 is adjusted so that the region before it contains at least 3 letters.
    $minimumLettersBeforeR1 = 3;
    if ($this->r1Index < $minimumLettersBeforeR1) {
        $this->r1Index = $minimumLettersBeforeR1;
        $this->r1 = UTF8::substr($this->word, $minimumLettersBeforeR1);
    }

    // Steps 1, 2, 3 and 4, in that order.
    $this->step1();
    $this->step2();
    $this->step3();
    $this->step4();

    return $this->word;
}
|
||||
|
||||
/**
 * Tells whether the given word part ends with a valid s-ending, i.e. one of
 * a b c d f g h j k l m n o p r t v y z å
 *
 * @param string $word the word part that precedes a candidate final `s`
 * @return boolean true when the last letter of $word is a valid s-ending
 */
private function hasValidSEnding($word)
{
    $validSEndings = [
        'a', 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l',
        'm', 'n', 'o', 'p', 'r', 't', 'v', 'y', 'z', 'å',
    ];

    $lastLetter = UTF8::substr($word, -1, 1);

    // Strict comparison: only an exact one-letter string may match.
    return in_array($lastLetter, $validSEndings, true);
}
|
||||
|
||||
/**
 * Step 1
 * Search for the longest among the following suffixes in R1, and perform the action indicated.
 *
 * @return bool true when one of the step-1 suffixes was found in R1, false otherwise
 */
private function step1()
{
    // hed ethed ered e erede ende erende ene erne ere en heden eren er heder erer
    // heds es endes erendes enes ernes eres ens hedens erens ers ets erets et eret
    //      delete
    // The list is ordered longest first so that a short suffix cannot hide a longer one.
    $position = $this->searchIfInR1([
        'erendes', 'erende', 'hedens', 'erede', 'ethed', 'heden', 'endes', 'erets', 'heder', 'ernes',
        'erens', 'ered', 'ende', 'erne', 'eres', 'eren', 'eret', 'erer', 'enes', 'heds',
        'ens', 'ene', 'ere', 'ers', 'ets', 'hed', 'es', 'et', 'er', 'en', 'e',
    ]);
    if ($position !== false) {
        $this->word = UTF8::substr($this->word, 0, $position);

        return true;
    }

    // s
    //      delete if preceded by a valid s-ending
    $position = $this->searchIfInR1(['s']);
    if ($position !== false) {
        $withoutS = UTF8::substr($this->word, 0, $position);
        if ($this->hasValidSEnding($withoutS)) {
            $this->word = $withoutS;
        }

        return true;
    }

    // No step-1 suffix in R1: report it explicitly instead of falling off the end.
    return false;
}
|
||||
|
||||
/**
 * Step 2
 * When the word ends, in R1, with one of gd dt gt kt, the last letter is deleted.
 */
private function step2()
{
    $match = $this->searchIfInR1(['gd', 'dt', 'gt', 'kt']);
    if ($match === false) {
        return;
    }

    // Only the final letter of the two-letter suffix is dropped.
    $this->word = UTF8::substr($this->word, 0, -1);
}
|
||||
|
||||
/**
 * Step 3
 *
 * - a word ending in igst loses its final st;
 * - then ig, lig, elig or els in R1 is deleted and step 2 is repeated;
 * - otherwise løst in R1 is replaced with løs.
 *
 * Returns true only on the "ig lig elig els" path; the other paths
 * return nothing.
 */
private function step3()
{
    // If the word ends igst, remove the final st.
    $endsWithIgst = $this->search(['igst']) !== false;
    if ($endsWithIgst) {
        $this->word = UTF8::substr($this->word, 0, -2);
    }

    // ig lig elig els: delete, and then repeat step 2.
    $position = $this->searchIfInR1(['elig', 'lig', 'ig', 'els']);
    if ($position !== false) {
        $this->word = UTF8::substr($this->word, 0, $position);
        $this->step2();

        return true;
    }

    // løst: replace with løs, i.e. drop the trailing t.
    $loestFound = $this->searchIfInR1(['løst']) !== false;
    if ($loestFound) {
        $this->word = UTF8::substr($this->word, 0, -1);
    }
}
|
||||
|
||||
/**
 * Step 4: undouble
 * If the word ends with a double consonant in R1, one of the consonants is removed.
 *
 * @return bool false when the last letter is outside R1 or is a vowel, true otherwise
 */
private function step4()
{
    // Position of the last letter of the word.
    $lastIndex = UTF8::strlen($this->word) - 1;
    if (!$this->inR1($lastIndex)) {
        return false;
    }

    $last = UTF8::substr($this->word, -1, 1);
    if (in_array($last, self::$vowels)) {
        return false;
    }

    // Same consonant twice at the end: keep only one of them.
    $secondToLast = UTF8::substr($this->word, -2, 1);
    if ($last == $secondToLast) {
        $this->word = UTF8::substr($this->word, 0, -1);
    }

    return true;
}
|
||||
}
|
||||
306
libraries/vendor/wamania/php-stemmer/src/Stemmer/Dutch.php
vendored
Normal file
306
libraries/vendor/wamania/php-stemmer/src/Stemmer/Dutch.php
vendored
Normal file
@ -0,0 +1,306 @@
|
||||
<?php
|
||||
|
||||
namespace Wamania\Snowball\Stemmer;
|
||||
|
||||
use voku\helper\UTF8;
|
||||
|
||||
/**
|
||||
*
|
||||
* @link http://snowball.tartarus.org/algorithms/dutch/stemmer.html
|
||||
* @author wamania
|
||||
*
|
||||
*/
|
||||
class Dutch extends Stem
|
||||
{
|
||||
/**
|
||||
* All dutch vowels
|
||||
*/
|
||||
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y', 'è');
|
||||
|
||||
/**
 * {@inheritdoc}
 *
 * Lower-cases the word, strips umlauts and acute accents, marks the
 * consonant-like y / i as upper case, computes R1 (with at least three
 * letters before it) and R2, then runs steps 1, 2, 3a, 3b and 4 and the
 * final clean-up.
 *
 * @param string $word word to stem; must be valid UTF-8
 * @return string the stemmed word
 * @throws \Exception when $word is not valid UTF-8
 */
public function stem($word)
{
    // Everything below works on UTF-8 text only.
    if (!UTF8::is_utf8($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word = UTF8::strtolower($word);

    // First, remove all umlaut and acute accents.
    $accented = ['ä', 'ë', 'ï', 'ö', 'ü', 'á', 'é', 'í', 'ó', 'ú'];
    $plain    = ['a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'u'];
    $this->word = UTF8::str_replace($accented, $plain, $this->word);

    $this->plainVowels = implode('', self::$vowels);
    $vowelClass = '[' . $this->plainVowels . ']';

    // Put initial y, y after a vowel, and i between vowels into upper case.
    $this->word = preg_replace('#^y#u', 'Y', $this->word);
    $this->word = preg_replace('#(' . $vowelClass . ')y#u', '$1Y', $this->word);
    $this->word = preg_replace('#(' . $vowelClass . ')i(' . $vowelClass . ')#u', '$1I$2', $this->word);

    // R1 and R2 are first set up in the standard way ...
    $this->r1();
    $this->r2();

    // ... then R1 is adjusted so that the region before it contains at least 3 letters.
    $minimumLettersBeforeR1 = 3;
    if ($this->r1Index < $minimumLettersBeforeR1) {
        $this->r1Index = $minimumLettersBeforeR1;
        $this->r1 = UTF8::substr($this->word, $minimumLettersBeforeR1);
    }

    // Steps 1, 2, 3a, 3b and 4; step 3b needs to know whether step 2 removed an e.
    $this->step1();
    $eWasRemoved = $this->step2();
    $this->step3a();
    $this->step3b($eWasRemoved);
    $this->step4();
    $this->finish();

    return $this->word;
}
|
||||
|
||||
/**
 * A valid s-ending is a non-vowel other than j.
 *
 * @param string $word the word part that precedes the candidate suffix
 * @return boolean true when the last letter of $word is a valid s-ending
 */
private function hasValidSEnding($word)
{
    // Letters that may NOT precede the suffix: every vowel, plus j.
    $rejected = self::$vowels;
    $rejected[] = 'j';

    $last = UTF8::substr($word, -1, 1);

    return !in_array($last, $rejected);
}
|
||||
|
||||
/**
 * A valid en-ending is a non-vowel, and not the end of "gem".
 *
 * @param string $word the word part that precedes the candidate suffix
 * @return boolean true when $word ends with a valid en-ending
 */
private function hasValidEnEnding($word)
{
    $endsWithVowel = in_array(UTF8::substr($word, -1, 1), self::$vowels);

    // The "gem" test is only evaluated when the last letter is a non-vowel.
    return !$endsWithVowel && UTF8::substr($word, -3, 3) !== 'gem';
}
|
||||
|
||||
/**
 * Undoubles the ending: when the word ends kk, dd or tt the last letter is removed.
 */
private function unDoubling()
{
    $endsDoubled = $this->search(['kk', 'dd', 'tt']) !== false;
    if (!$endsDoubled) {
        return;
    }

    $this->word = UTF8::substr($this->word, 0, -1);
}
|
||||
|
||||
/**
 * Step 1
 *
 * Tries, in order, the suffix groups heden, en/ene and s/se. The first
 * group that matches the end of the word ends the step, whether or not
 * its action could be applied.
 *
 * @return bool true when one of the groups matched, false otherwise
 */
private function step1()
{
    // heden: replace with heid if in R1.
    $position = $this->search(['heden']);
    if ($position !== false) {
        if ($this->inR1($position)) {
            $this->word = preg_replace('#(heden)$#u', 'heid', $this->word);
        }

        return true;
    }

    // en ene: delete if in R1 and preceded by a valid en-ending, then undouble the ending.
    $position = $this->search(['ene', 'en']);
    if ($position !== false) {
        if ($this->inR1($position)) {
            $stem = UTF8::substr($this->word, 0, $position);
            if ($this->hasValidEnEnding($stem)) {
                $this->word = $stem;
                $this->unDoubling();
            }
        }

        return true;
    }

    // s se: delete if in R1 and preceded by a valid s-ending.
    $position = $this->search(['se', 's']);
    if ($position === false) {
        return false;
    }

    if ($this->inR1($position)) {
        $stem = UTF8::substr($this->word, 0, $position);
        if ($this->hasValidSEnding($stem)) {
            $this->word = $stem;
        }
    }

    return true;
}
|
||||
|
||||
/**
 * Step 2
 * Deletes the suffix e if it is in R1 and preceded by a non-vowel, then undoubles the ending.
 *
 * @return bool true when an e was actually removed, false otherwise
 */
private function step2()
{
    $position = $this->search(['e']);
    if ($position === false || !$this->inR1($position)) {
        return false;
    }

    // Letter right before the final e.
    $preceding = UTF8::substr($this->word, -2, 1);
    if (in_array($preceding, self::$vowels)) {
        return false;
    }

    $this->word = UTF8::substr($this->word, 0, $position);
    $this->unDoubling();

    return true;
}
|
||||
|
||||
/**
 * Step 3a: heid
 * Deletes heid if it is in R2 and not preceded by c, then treats a
 * preceding en as in step 1(b).
 */
private function step3a()
{
    $position = $this->search(['heid']);
    if ($position === false || !$this->inR2($position)) {
        return;
    }

    // The letter right before "heid" must not be a c.
    $beforeHeid = UTF8::substr($this->word, -5, 1);
    if ($beforeHeid === 'c') {
        return;
    }

    $this->word = UTF8::substr($this->word, 0, $position);

    // A preceding en: delete if in R1 and preceded by a valid en-ending, then undouble.
    $enPosition = $this->search(['en']);
    if ($enPosition === false || !$this->inR1($enPosition)) {
        return;
    }

    $stem = UTF8::substr($this->word, 0, $enPosition);
    if ($this->hasValidEnEnding($stem)) {
        $this->word = $stem;
        $this->unDoubling();
    }
}
|
||||
|
||||
/**
 * Step 3b: d-suffixes
 *
 * Tries, in order, end/ing, ig, lijk, baar and bar. The first group that
 * matches the end of the word ends the step, whether or not its action
 * could be applied.
 *
 * @param mixed $removedE truthy when step 2 actually removed an e (needed for the bar rule)
 * @return bool true when one of the groups matched, false otherwise
 */
private function step3b($removedE)
{
    // end ing: delete if in R2; then, if preceded by ig, delete that if it is
    // in R2 and not preceded by e, otherwise undouble the ending.
    $position = $this->search(['end', 'ing']);
    if ($position !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);

            $igPosition = $this->searchIfInR2(['ig']);
            if ($igPosition === false) {
                $this->unDoubling();
            } elseif (UTF8::substr($this->word, -3, 1) !== 'e') {
                $this->word = UTF8::substr($this->word, 0, $igPosition);
            }
        }

        return true;
    }

    // ig: delete if in R2 and not preceded by e.
    $position = $this->search(['ig']);
    if ($position !== false) {
        if ($this->inR2($position)) {
            $beforeIg = UTF8::substr($this->word, -3, 1);
            if ($beforeIg !== 'e') {
                $this->word = UTF8::substr($this->word, 0, $position);
            }
        }

        return true;
    }

    // lijk: delete if in R2, and then repeat step 2.
    $position = $this->search(['lijk']);
    if ($position !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
            $this->step2();
        }

        return true;
    }

    // baar: delete if in R2.
    $position = $this->search(['baar']);
    if ($position !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        return true;
    }

    // bar: delete if in R2 and if step 2 actually removed an e.
    $position = $this->search(['bar']);
    if ($position === false) {
        return false;
    }

    if ($this->inR2($position) && $removedE) {
        $this->word = UTF8::substr($this->word, 0, $position);
    }

    return true;
}
|
||||
|
||||
/**
 * Step 4: undouble vowel
 * If the word ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is
 * double a, e, o or u, one of the vowels of V is removed (maan -> man, brood -> brod).
 *
 * Returns false when the word does not have that shape and nothing
 * otherwise.
 */
private function step4()
{
    // D: last letter, must be a non-vowel other than I.
    $final = UTF8::substr($this->word, -1, 1);
    $notAllowedAsFinal = array_merge(self::$vowels, ['I']);
    if (in_array($final, $notAllowedAsFinal)) {
        return false;
    }

    // V: the two letters before D, must be a doubled a, e, o or u.
    $doubled = UTF8::substr($this->word, -3, 2);
    if (!in_array($doubled, ['aa', 'ee', 'oo', 'uu'])) {
        return false;
    }
    $single = UTF8::substr($doubled, 0, 1);

    // C: the letter before V, must be a non-vowel.
    $leading = UTF8::substr($this->word, -4, 1);
    if (in_array($leading, self::$vowels)) {
        return false;
    }

    // Rebuild the ending with a single vowel in place of the doubled one.
    $this->word = UTF8::substr($this->word, 0, -4) . $leading . $single . $final;
}
|
||||
|
||||
/**
 * Finally
 * The I and Y markers introduced at the start of stem() are turned back into lower case.
 */
private function finish()
{
    $markers   = ['I', 'Y'];
    $lowerCase = ['i', 'y'];

    $this->word = UTF8::str_replace($markers, $lowerCase, $this->word);
}
|
||||
}
|
||||
602
libraries/vendor/wamania/php-stemmer/src/Stemmer/English.php
vendored
Normal file
602
libraries/vendor/wamania/php-stemmer/src/Stemmer/English.php
vendored
Normal file
@ -0,0 +1,602 @@
|
||||
<?php
|
||||
|
||||
namespace Wamania\Snowball\Stemmer;
|
||||
|
||||
use voku\helper\UTF8;
|
||||
|
||||
/**
|
||||
* English Porter 2
|
||||
*
|
||||
* @link http://snowball.tartarus.org/algorithms/english/stemmer.html
|
||||
* @author wamania
|
||||
*
|
||||
*/
|
||||
class English extends Stem
|
||||
{
|
||||
/**
|
||||
* All english vowels
|
||||
*/
|
||||
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y');
|
||||
|
||||
protected static $doubles = array('bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt');
|
||||
|
||||
protected static $liEnding = array('c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't');
|
||||
|
||||
/**
 * {@inheritdoc}
 *
 * Words shorter than three characters are returned untouched. Otherwise
 * the word is lower-cased, checked against the first exception list,
 * prepared (leading apostrophe removed, consonant-like y marked as Y),
 * R1/R2 are computed and steps 0 to 5 are applied, with the second
 * exception list checked right after step 1a.
 *
 * @param string $word word to stem; must be valid UTF-8
 * @return string the stemmed word
 * @throws \Exception when $word is not valid UTF-8
 */
public function stem($word)
{
    // we do ALL in UTF-8
    if (!UTF8::is_utf8($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    // Class name spelled exactly as imported (UTF8), like everywhere else in the file.
    if (UTF8::strlen($word) < 3) {
        return $word;
    }

    $this->word = UTF8::strtolower($word);

    // exceptions
    if (null !== ($exception = $this->exception1())) {
        return $exception;
    }

    $this->plainVowels = implode('', self::$vowels);

    // Remove initial ', if present.
    if (UTF8::substr($this->word, 0, 1) === "'") {
        $this->word = UTF8::substr($this->word, 1);
    }

    // Set initial y, or y after a vowel, to Y.
    // The initial-y test must look at the word AFTER the apostrophe was removed;
    // testing the first character captured beforehand would miss words like "'yes".
    $this->word = preg_replace('#^y#u', 'Y', $this->word);
    $this->word = preg_replace('#(['.$this->plainVowels.'])y#u', '$1Y', $this->word);

    $this->r1();
    $this->exceptionR1();
    $this->r2();

    $this->step0();
    $this->step1a();

    // exceptions 2
    if (null !== ($exception = $this->exception2())) {
        return $exception;
    }

    $this->step1b();
    $this->step1c();
    $this->step2();
    $this->step3();
    $this->step4();
    $this->step5();
    $this->finish();

    return $this->word;
}
|
||||
|
||||
/**
 * Step 0
 * Removes a trailing ', 's or 's' (the longest one that matches).
 */
private function step0()
{
    // Longest suffix first.
    $position = $this->search(["'s'", "'s", "'"]);
    if ($position === false) {
        return;
    }

    $this->word = UTF8::substr($this->word, 0, $position);
}
|
||||
|
||||
/**
 * Step 1a
 * Handles the endings sses, ied/ies, us/ss and s, in that order.
 *
 * @return boolean true as soon as one of the endings is recognised (whether
 *                 or not the word was changed), false when none matched
 */
private function step1a()
{
    // sses: replace by ss
    if ($this->search(array('sses')) !== false) {
        $this->word = preg_replace('#(sses)$#u', 'ss', $this->word);

        return true;
    }

    // ied / ies: replace by i if preceded by more than one letter,
    // otherwise by ie (so ties -> tie, cries -> cri)
    $at = $this->search(array('ied', 'ies'));
    if ($at !== false) {
        $replacement = ($at > 1) ? 'i' : 'ie';
        $this->word = preg_replace('#(ied|ies)$#u', $replacement, $this->word);

        return true;
    }

    // us / ss: recognised, but nothing to do
    if ($this->search(array('us', 'ss')) !== false) {
        return true;
    }

    // s: delete if the preceding word part contains a vowel that is not
    // immediately before the s (gas and this keep the s, gaps and kiwis lose it)
    $at = $this->search(array('s'));
    if ($at === false) {
        return false;
    }

    $index = 0;
    while ($index < $at - 1) {
        if (in_array(UTF8::substr($this->word, $index, 1), self::$vowels)) {
            $this->word = UTF8::substr($this->word, 0, $at);
            break;
        }
        $index++;
    }

    return true;
}
|
||||
|
||||
/**
 * Step 1b
 * Handles eed/eedly and ed/edly/ing/ingly.
 *
 * @return boolean true when one of the endings was recognised (whether or
 *                 not the word was changed), false otherwise
 */
private function step1b()
{
    // eed eedly: replace by ee if in R1
    $at = $this->search(array('eedly', 'eed'));
    if ($at !== false) {
        if ($this->inR1($at)) {
            $this->word = preg_replace('#(eedly|eed)$#u', 'ee', $this->word);
        }

        return true;
    }

    // ed edly ing ingly: delete if the preceding word part contains a vowel
    $at = $this->search(array('edly', 'ingly', 'ed', 'ing'));
    if ($at === false) {
        return false;
    }

    $precededByVowel = false;
    for ($k = 0; $k < $at; $k++) {
        if (in_array(UTF8::substr($this->word, $k, 1), self::$vowels)) {
            $precededByVowel = true;
            break;
        }
    }

    if (!$precededByVowel) {
        return true;
    }

    $this->word = UTF8::substr($this->word, 0, $at);

    // after the deletion: at, bl or iz get an e appended (luxuriat -> luxuriate)
    if ($this->search(array('at', 'bl', 'iz')) !== false) {
        $this->word .= 'e';

        return true;
    }

    // a trailing double loses its last letter (hopp -> hop)
    $doubleAt = $this->search(self::$doubles);
    if ($doubleAt !== false) {
        $this->word = UTF8::substr($this->word, 0, $doubleAt + 1);

        return true;
    }

    // a short word gets an e appended (hop -> hope)
    if ($this->isShort()) {
        $this->word .= 'e';
    }

    return true;
}
|
||||
|
||||
/**
 * Step 1c
 * Replace a final y or Y by i when the letter before it is not a vowel.
 * Words shorter than three characters are skipped, so the letter before the
 * y is never the first letter of the word (cry -> cri, by -> by, say -> say).
 *
 * @return boolean false only when the word (of length >= 3) does not end in
 *                 y or Y; true in every other case
 */
private function step1c()
{
    if (UTF8::strlen($this->word) < 3) {
        return true;
    }

    $at = $this->search(array('y', 'Y'));
    if ($at === false) {
        return false;
    }

    $previous = UTF8::substr($this->word, $at - 1, 1);
    if (in_array($previous, self::$vowels)) {
        return true;
    }

    $this->word = preg_replace('#(y|Y)$#u', 'i', $this->word);

    return true;
}
|
||||
|
||||
/**
 * Step 2
 * Walks an ordered list of suffix groups. The first group for which
 * search() finds an ending decides the outcome: when that ending lies in R1
 * it is rewritten, and the method returns true either way.
 * "ogi" and "li" carry an extra condition on the preceding letter and are
 * handled after the table.
 *
 * @return boolean true when an ending was recognised, false otherwise
 */
private function step2()
{
    // array(endings to look for, replacement) - order matters, earlier groups win
    $rules = array(
        array(array('iveness', 'iviti'), 'ive'),
        array(array('ousli', 'ousness'), 'ous'),
        array(array('izer', 'ization'), 'ize'),
        array(array('ational', 'ation', 'ator'), 'ate'),
        array(array('biliti', 'bli'), 'ble'),
        array(array('lessli'), 'less'),
        array(array('fulness', 'fulli'), 'ful'),
        array(array('tional'), 'tion'),
        array(array('alism', 'aliti', 'alli'), 'al'),
        array(array('enci'), 'ence'),
        array(array('anci'), 'ance'),
        array(array('abli'), 'able'),
        array(array('entli'), 'ent'),
    );

    foreach ($rules as $rule) {
        list($endings, $replacement) = $rule;

        $at = $this->search($endings);
        if ($at === false) {
            continue;
        }

        if ($this->inR1($at)) {
            $pattern = '#(' . implode('|', $endings) . ')$#u';
            $this->word = preg_replace($pattern, $replacement, $this->word);
        }

        return true;
    }

    // ogi: replace by og if in R1 and preceded by l
    $at = $this->search(array('ogi'));
    if ($at !== false) {
        if ($this->inR1($at) && UTF8::substr($this->word, $at - 1, 1) == 'l') {
            $this->word = preg_replace('#(ogi)$#u', 'og', $this->word);
        }

        return true;
    }

    // li: delete if in R1 and preceded by a valid li-ending
    $at = $this->search(array('li'));
    if ($at !== false) {
        if ($this->inR1($at) && in_array(UTF8::substr($this->word, $at - 1, 1), self::$liEnding)) {
            $this->word = UTF8::substr($this->word, 0, $at);
        }

        return true;
    }

    return false;
}
|
||||
|
||||
/**
 * Step 3
 * Rewrites or deletes a suffix that searchIfInR1() reports:
 * ational -> ate, tional -> tion, alize -> al, icate/iciti/ical -> ic,
 * ful/ness -> deleted, ative -> deleted only when also in R2.
 *
 * @return boolean true when the word was changed, false otherwise
 */
private function step3()
{
    // replacement => endings, tried in this order
    $rewrites = array(
        'ate'  => array('ational'),
        'tion' => array('tional'),
        'al'   => array('alize'),
        'ic'   => array('icate', 'iciti', 'ical'),
    );

    foreach ($rewrites as $replacement => $endings) {
        if ($this->searchIfInR1($endings) !== false) {
            $pattern = '#(' . implode('|', $endings) . ')$#u';
            $this->word = preg_replace($pattern, $replacement, $this->word);

            return true;
        }
    }

    // ful ness: delete
    $at = $this->searchIfInR1(array('ful', 'ness'));
    if ($at !== false) {
        $this->word = UTF8::substr($this->word, 0, $at);

        return true;
    }

    // ative: delete if in R2
    $at = $this->searchIfInR1(array('ative'));
    if ($at !== false && $this->inR2($at)) {
        $this->word = UTF8::substr($this->word, 0, $at);

        return true;
    }

    return false;
}
|
||||
|
||||
/**
 * Step 4
 * Deletes one of the listed derivational endings when it lies in R2.
 * If none of them ends the word, "ion" is considered: it must be reported
 * by searchIfInR2() and is deleted only when preceded by s or t.
 *
 * @return boolean true when an ending was recognised (whether or not it was
 *                 deleted), false otherwise
 */
private function step4()
{
    $endings = array(
        'ance', 'ence', 'ement', 'able', 'ible', 'ant', 'ment', 'ent', 'ism',
        'ate', 'iti', 'ous', 'ive', 'ize', 'al', 'er', 'ic',
    );

    $at = $this->search($endings);
    if ($at !== false) {
        if ($this->inR2($at)) {
            $this->word = UTF8::substr($this->word, 0, $at);
        }

        return true;
    }

    // ion: delete if preceded by s or t
    $at = $this->searchIfInR2(array('ion'));
    if ($at === false) {
        return false;
    }

    $previous = UTF8::substr($this->word, $at - 1, 1);
    if ($previous == 's' || $previous == 't') {
        $this->word = UTF8::substr($this->word, 0, $at);
    }

    return true;
}
|
||||
|
||||
/**
 * Step 5
 * Final e: delete if in R2, or if in R1 and neither short-syllable probe
 * (searchShortSyllabe(-4, 3), searchShortSyllabe(-3, 2)) succeeds.
 * Final l: delete if reported by searchIfInR2() and preceded by another l.
 *
 * @return boolean true when a final e or a final l in R2 was recognised
 *                 (whether or not it was deleted), false otherwise
 */
private function step5()
{
    $at = $this->search(array('e'));
    if ($at !== false) {
        $deletable = $this->inR2($at)
            || (
                $this->inR1($at)
                && !$this->searchShortSyllabe(-4, 3)
                && !$this->searchShortSyllabe(-3, 2)
            );

        if ($deletable) {
            $this->word = UTF8::substr($this->word, 0, $at);
        }

        return true;
    }

    $at = $this->searchIfInR2(array('l'));
    if ($at === false) {
        return false;
    }

    if (UTF8::substr($this->word, $at - 1, 1) == 'l') {
        $this->word = UTF8::substr($this->word, 0, $at);
    }

    return true;
}
|
||||
|
||||
/**
 * Final clean-up: every upper-case Y in the working word is turned back
 * into a lower-case y.
 */
private function finish()
{
    $restored = UTF8::str_replace('Y', 'y', $this->word);

    $this->word = $restored;
}
|
||||
|
||||
/**
 * Overrides R1 for words that start with gener, commun or arsen:
 * R1 becomes everything after that prefix and r1Index the prefix length.
 * Words with any other beginning are left untouched.
 */
private function exceptionR1()
{
    // prefix => index at which R1 starts
    $overrides = array(
        'gener'  => 5,
        'commun' => 6,
        'arsen'  => 5,
    );

    foreach ($overrides as $prefix => $start) {
        if (UTF8::strpos($this->word, $prefix) === 0) {
            $this->r1 = UTF8::substr($this->word, $start);
            $this->r1Index = $start;

            return;
        }
    }
}
|
||||
|
||||
/**
 * First exception list, looked up with the current (lower-cased) word.
 *
 * 1/ a handful of special words map to a fixed stem,
 * 2/ a handful of words are returned unchanged.
 *
 * @return string|null the fixed result, or null when the word is not listed
 */
private function exception1()
{
    $current = $this->word;

    // 1/ special stems
    $special = array(
        'skis'   => 'ski',
        'skies'  => 'sky',
        'dying'  => 'die',
        'lying'  => 'lie',
        'tying'  => 'tie',
        'idly'   => 'idl',
        'gently' => 'gentl',
        'ugly'   => 'ugli',
        'early'  => 'earli',
        'only'   => 'onli',
        'singly' => 'singl',
    );

    if (isset($special[$current])) {
        return $special[$current];
    }

    // 2/ invariants
    $invariants = array('sky', 'news', 'howe', 'atlas', 'cosmos', 'bias', 'andes');

    if (in_array($current, $invariants, true)) {
        return $current;
    }

    return null;
}
|
||||
|
||||
/**
 * Second exception list, consulted by stem() after step 1a: the listed
 * words are returned unchanged.
 *
 * @return string|null the word itself when it is listed, null otherwise
 */
private function exception2()
{
    $invariants = array(
        'inning',
        'outing',
        'canning',
        'herring',
        'earring',
        'proceed',
        'exceed',
        'succeed',
    );

    return in_array($this->word, $invariants, true) ? $this->word : null;
}
|
||||
|
||||
/**
 * A word is called short if it ends in a short syllable and R1 is null.
 * "R1 is null" is tested as: the current word length equals r1Index.
 *
 * @return boolean
 */
private function isShort()
{
    $endsInShortSyllable = $this->searchShortSyllabe(-3, 3) || $this->searchShortSyllabe(-2, 2);

    if (!$endsInShortSyllable) {
        return false;
    }

    return UTF8::strlen($this->word) == $this->r1Index;
}
|
||||
|
||||
/**
 * Tests for a short syllable starting at $from.
 *
 * A short syllable is either
 * (a) a vowel followed by a non-vowel other than w, x or Y and preceded by a non-vowel, or
 * (b) a vowel at the beginning of the word followed by a non-vowel.
 *
 * So rap, trap, entrap end with a short syllable, and ow, on, at are classed as short syllables.
 * But uproot, bestow, disturb do not end with a short syllable.
 *
 * @param integer $from      Start offset; a negative value counts from the end
 *                           of the word and is clamped to 0.
 * @param integer $nbLetters 2 selects the two-letter form (b), which is only
 *                           considered at offset 0; any other value tests the
 *                           three-letter form (a).
 *
 * @return boolean
 */
private function searchShortSyllabe($from, $nbLetters)
{
    if ($from < 0) {
        $from = max(0, UTF8::strlen($this->word) + $from);
    }

    $twoLetterForm = ($nbLetters == 2);

    // (b) only applies at the very beginning of the word
    if ($twoLetterForm && $from != 0) {
        return false;
    }

    $firstIsVowel = in_array(UTF8::substr($this->word, $from, 1), self::$vowels);
    $secondIsVowel = in_array(UTF8::substr($this->word, $from + 1, 1), self::$vowels);

    if ($twoLetterForm && $firstIsVowel && !$secondIsVowel) {
        return true;
    }

    // (a) non-vowel, vowel, then a letter that is neither a vowel nor x, Y, w.
    // Note: this check is also reached when the two-letter form did not match.
    if ($firstIsVowel || !$secondIsVowel) {
        return false;
    }

    $third = UTF8::substr($this->word, $from + 2, 1);
    $excluded = array_merge(self::$vowels, array('x', 'Y', 'w'));

    return !in_array($third, $excluded);
}
|
||||
}
|
||||
444
libraries/vendor/wamania/php-stemmer/src/Stemmer/Finnish.php
vendored
Normal file
444
libraries/vendor/wamania/php-stemmer/src/Stemmer/Finnish.php
vendored
Normal file
@ -0,0 +1,444 @@
|
||||
<?php
|
||||
/**
|
||||
* Finnish Snowball Stemmer.
|
||||
*
|
||||
* @author msaari <mikko@mikkosaari.fi>
|
||||
*/
|
||||
namespace Wamania\Snowball\Stemmer;
|
||||
|
||||
use voku\helper\UTF8;
|
||||
|
||||
/**
|
||||
* Finnish Snowball Stemmer.
|
||||
*
|
||||
* @link http://snowball.tartarus.org/algorithms/finnish/stemmer.html
|
||||
* @author msaari
|
||||
*/
|
||||
class Finnish extends Stem
|
||||
{
|
||||
/**
|
||||
* All Finnish vowels
|
||||
*/
|
||||
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y', 'ä', 'ö');
|
||||
|
||||
protected static $consonants = array('b', 'c', 'd', 'f', 'g', 'h', 'j',
|
||||
'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'z');
|
||||
|
||||
protected static $restrictedVowels = array('a', 'e', 'i', 'o', 'u', 'ä', 'ö');
|
||||
|
||||
/**
|
||||
* Long restricted vowels, ie. doubled vowels.
|
||||
*/
|
||||
protected static $longVowels = array('aa', 'ee', 'ii', 'oo', 'uu', 'ää', 'öö');
|
||||
|
||||
private $_removedInStep3 = false;
|
||||
|
||||
/**
 * {@inheritdoc}
 *
 * Stems one Finnish word: validates the encoding, lower-cases the word,
 * computes R1/R2 and runs steps 1 to 6 in order.
 *
 * @param string $word Word to stem; must be valid UTF-8.
 *
 * @return string The stem.
 *
 * @throws \Exception When $word is not valid UTF-8.
 */
public function stem($word)
{
    // we do ALL in UTF-8
    if (!UTF8::is_utf8($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word = UTF8::strtolower($word);

    // Per-word state: step 3 raises this flag and step 5 reads it. It has to
    // be cleared here, otherwise a stemmer instance that is reused for
    // several words would carry the flag over from the previous word.
    $this->_removedInStep3 = false;

    // R1 and R2 are then defined in the usual way
    $this->r1();
    $this->r2();

    // Do each of steps 1, 2, 3, 4, 5 and 6.
    $this->step1();
    $this->step2();
    $this->step3();
    $this->step4();
    $this->step5();
    $this->step6();

    return $this->word;
}
|
||||
|
||||
/**
 * Step 1
 *
 * (a) kin kaan kään ko kö han hän pa pä, reported by searchIfInR1():
 *     delete if preceded by n, t or a vowel.
 * (b) sti, reported by searchIfInR1(): delete if in R2.
 * R1 and R2 are recomputed after every deletion.
 *
 * @return boolean|null true when one of the endings was recognised (whether
 *                      or not it was deleted); nothing is returned otherwise
 */
private function step1()
{
    $particles = array('kaan', 'kään', 'kin', 'han', 'hän', 'ko', 'kö', 'pa', 'pä');

    $at = $this->searchIfInR1($particles);
    if ($at !== false) {
        $allowedBefore = array_merge(['t', 'n'], self::$vowels);
        $previous = UTF8::substr($this->word, $at - 1, 1);

        if (in_array($previous, $allowedBefore)) {
            $this->word = UTF8::substr($this->word, 0, $at);
            $this->r1();
            $this->r2();
        }

        return true;
    }

    $at = $this->searchIfInR1(array('sti'));
    if ($at === false) {
        return;
    }

    if ($this->inR2($at)) {
        $this->word = UTF8::substr($this->word, 0, $at);
        $this->r1();
        $this->r2();
    }

    return true;
}
|
||||
|
||||
/**
 * Step 2: possessives.
 *
 * Each ending is looked up with searchIfInR1(), in this order:
 *  - si: delete if not preceded by k
 *  - ni: delete; a stem then ending in kse becomes ksi
 *  - nsa nsä mme nne: delete
 *  - an: delete if preceded by one of ta ssa sta lla lta na
 *  - än: delete if preceded by one of tä ssä stä llä ltä nä
 *  - en: delete if preceded by one of lle ine
 * R1 and R2 are recomputed after every deletion.
 *
 * @return boolean|null true when an ending was deleted; nothing is returned
 *                      otherwise
 */
private function step2()
{
    // si
    $at = $this->searchIfInR1(array('si'));
    if ($at !== false && UTF8::substr($this->word, $at - 1, 1) !== 'k') {
        $this->word = UTF8::substr($this->word, 0, $at);
        $this->r1();
        $this->r2();

        return true;
    }

    // ni
    $at = $this->searchIfInR1(array('ni'));
    if ($at !== false) {
        $this->word = UTF8::substr($this->word, 0, $at);

        if ($this->search(array('kse')) !== false) {
            $this->word = preg_replace('#(kse)$#u', 'ksi', $this->word);
        }

        $this->r1();
        $this->r2();

        return true;
    }

    // nsa nsä mme nne
    $at = $this->searchIfInR1(array('nsa', 'nsä', 'mme', 'nne'));
    if ($at !== false) {
        $this->word = UTF8::substr($this->word, 0, $at);
        $this->r1();
        $this->r2();

        return true;
    }

    // an / än: ending => array(allowed three-letter tails, allowed two-letter tails)
    $guarded = array(
        'an' => array(array('ssa', 'sta', 'lla', 'lta'), array('na', 'ta')),
        'än' => array(array('ssä', 'stä', 'llä', 'ltä'), array('nä', 'tä')),
    );

    foreach ($guarded as $ending => $tails) {
        $at = $this->searchIfInR1(array($ending));
        if ($at === false) {
            continue;
        }

        list($threeLetterTails, $twoLetterTails) = $tails;
        $remainder = UTF8::substr($this->word, 0, $at);

        if (in_array(UTF8::substr($remainder, -3, 3), $threeLetterTails, true)
            || in_array(UTF8::substr($remainder, -2, 2), $twoLetterTails, true)
        ) {
            $this->word = $remainder;
            $this->r1();
            $this->r2();

            return true;
        }
    }

    // en
    $at = $this->searchIfInR1(array('en'));
    if ($at !== false && UTF8::strlen($this->word) > 4) {
        // the three letters in front of the final "en"
        $before = UTF8::substr($this->word, -5, 3);

        if (in_array($before, array('lle', 'ine'), true)) {
            $this->word = UTF8::substr($this->word, 0, $at);
            $this->r1();
            $this->r2();

            return true;
        }
    }
}
|
||||
|
||||
/**
 * Step 3: cases
 *
 * Each ending is looked up with searchIfInR1(), in this order:
 *  - hXn (X a restricted vowel other than u): delete if preceded by X
 *  - siin den tten: delete if preceded by Vi
 *  - seen: delete if preceded by a long vowel
 *  - tta ttä: delete if preceded by e
 *  - ta tä ssa ssä sta stä lla llä lta ltä lle na nä ksi ine: delete
 *  - a ä: delete if preceded by consonant + vowel
 *  - n: delete, and if preceded by a long vowel or ie, delete the last vowel too
 * Every deletion sets the _removedInStep3 flag and recomputes R1/R2.
 *
 * NOTE(review): when a conditional ending other than hXn is found but its
 * condition fails, the later rules are still tried - confirm this is intended.
 *
 * @return boolean|null true when an hXn ending was recognised or any ending
 *                      was deleted; nothing is returned otherwise
 */
private function step3()
{
    // shared tail of every successful rule: truncate, remember, refresh regions
    $strip = function ($keep) {
        $this->word = UTF8::substr($this->word, 0, $keep);
        $this->_removedInStep3 = true;
        $this->r1();
        $this->r2();
    };

    // hXn
    foreach (self::$restrictedVowels as $vowel) {
        if ($vowel === 'u') {
            continue;
        }

        $at = $this->searchIfInR1(array('h' . $vowel . 'n'));
        if ($at === false) {
            continue;
        }

        if (UTF8::substr($this->word, $at - 1, 1) === $vowel) {
            $strip($at);
        }

        return true;
    }

    // siin den tten
    $at = $this->searchIfInR1(array('siin', 'den', 'tten'));
    if ($at !== false
        && UTF8::substr($this->word, $at - 1, 1) === 'i'
        && in_array(UTF8::substr($this->word, $at - 2, 1), self::$restrictedVowels, true)
    ) {
        $strip($at);

        return true;
    }

    // seen
    $at = $this->searchIfInR1(array('seen'));
    if ($at !== false && in_array(UTF8::substr($this->word, $at - 2, 2), self::$longVowels, true)) {
        $strip($at);

        return true;
    }

    // tta ttä
    $at = $this->searchIfInR1(array('tta', 'ttä'));
    if ($at !== false && UTF8::substr($this->word, $at - 1, 1) === 'e') {
        $strip($at);

        return true;
    }

    // unconditional case endings
    $caseEndings = array(
        'ssa', 'ssä', 'sta', 'stä', 'lla', 'llä', 'lta', 'ltä',
        'lle', 'ksi', 'na', 'nä', 'ine', 'ta', 'tä',
    );
    $at = $this->searchIfInR1($caseEndings);
    if ($at !== false) {
        $strip($at);

        return true;
    }

    // a ä
    $at = $this->searchIfInR1(array('a', 'ä'));
    if ($at !== false) {
        $previous = UTF8::substr($this->word, $at - 1, 1);
        $beforePrevious = UTF8::substr($this->word, $at - 2, 1);

        if (in_array($previous, self::$vowels, true) && in_array($beforePrevious, self::$consonants, true)) {
            $strip($at);

            return true;
        }
    }

    // n
    $at = $this->searchIfInR1(array('n'));
    if ($at !== false) {
        $before = UTF8::substr($this->word, $at - 2, 2);
        $alsoDropVowel = in_array($before, self::$longVowels, true) || $before === 'ie';

        $strip($alsoDropVowel ? $at - 1 : $at);

        return true;
    }
}
|
||||
|
||||
/**
 * Step 4: other endings
 *
 * Search for the longest among the following suffixes in R2, and perform
 * the action indicated:
 *  - impi impa impä immi imma immä eja ejä: delete
 *  - mpi mpa mpä mmi mma mmä: delete if not preceded by po
 *
 * The longer "i..." endings are tested first. Each of them also ends in one
 * of the shorter endings, so testing the shorter group first would always
 * win and strip only part of the suffix.
 * R1 and R2 are recomputed after a deletion.
 *
 * @return boolean True when an ending was deleted, false otherwise.
 */
private function step4()
{
    // impi impa impä immi imma immä eja ejä
    // delete
    $position = $this->searchIfInR2(array('impi', 'impa', 'impä', 'immi', 'imma', 'immä', 'eja', 'ejä'));
    if ($position !== false) {
        $this->word = UTF8::substr($this->word, 0, $position);
        $this->r1();
        $this->r2();

        return true;
    }

    // mpi mpa mpä mmi mma mmä
    // delete if not preceded by po
    $position = $this->searchIfInR2(array('mpi', 'mpa', 'mpä', 'mmi', 'mma', 'mmä'));
    if ($position !== false) {
        $lastLetters = UTF8::substr($this->word, ($position - 2), 2);

        if ($lastLetters !== 'po') {
            $this->word = UTF8::substr($this->word, 0, $position);
            $this->r1();
            $this->r2();

            return true;
        }
    }

    return false;
}
|
||||
|
||||
/**
 * Step 5: plurals
 *
 * If an ending was removed in step 3, delete a final i or j reported by
 * searchIfInR1().
 * Otherwise delete a final t reported by searchIfInR1() if it follows a
 * vowel; once that t is removed, also delete a final imma, or a final mma
 * not preceded by po, reported by searchIfInR2().
 * R1 and R2 are recomputed after every deletion.
 *
 * @return boolean|null true when an i/j, or an imma/mma after the t, was
 *                      deleted; nothing is returned otherwise (including
 *                      when only the t was deleted)
 */
private function step5()
{
    if ($this->_removedInStep3) {
        $at = $this->searchIfInR1(array('i', 'j'));
        if ($at !== false) {
            $this->word = UTF8::substr($this->word, 0, $at);
            $this->r1();
            $this->r2();

            return true;
        }

        return;
    }

    $at = $this->searchIfInR1(array('t'));
    if ($at === false) {
        return;
    }

    if (!in_array(UTF8::substr($this->word, $at - 1, 1), self::$vowels, true)) {
        return;
    }

    $this->word = UTF8::substr($this->word, 0, $at);
    $this->r1();
    $this->r2();

    // imma: delete
    $tailAt = $this->searchIfInR2(array('imma'));
    if ($tailAt !== false) {
        $this->word = UTF8::substr($this->word, 0, $tailAt);
        $this->r1();
        $this->r2();

        return true;
    }

    // mma: delete unless preceded by po
    $tailAt = $this->searchIfInR2(array('mma'));
    if ($tailAt !== false && UTF8::substr($this->word, $tailAt - 2, 2) !== 'po') {
        $this->word = UTF8::substr($this->word, 0, $tailAt);
        $this->r1();
        $this->r2();

        return true;
    }
}
|
||||
|
||||
/**
 * Step 6: tidying up
 *
 * Do in turn steps (a) to (d), which look at R1, then (e) on the whole word:
 *  a) a long vowel reported by searchIfInR1(): delete its last letter
 *  b) R1 ends cX, c a consonant and X one of a ä e i: delete the last letter
 *  c) R1 ends oj or uj: delete the last letter
 *  d) R1 ends jo: delete the last letter
 *  e) the word ends with a double consonant followed by zero or more vowels:
 *     remove the last consonant (eläkk -> eläk, aatonaatto -> aatonaato)
 * R1 and R2 are recomputed after each of (a) to (d) that changes the word.
 */
private function step6()
{
    // (b), (c) and (d) all end the same way: drop the final letter, refresh regions
    $dropLastLetter = function () {
        $this->word = UTF8::substr($this->word, 0, -1);
        $this->r1();
        $this->r2();
    };

    // a)
    $at = $this->searchIfInR1(self::$longVowels);
    if ($at !== false) {
        $this->word = UTF8::substr($this->word, 0, $at + 1);
        $this->r1();
        $this->r2();
    }

    // b)
    $ultimate = UTF8::substr($this->r1, -1, 1);
    $penultimate = UTF8::substr($this->r1, -2, 1);
    if (in_array($penultimate, self::$consonants, true) && in_array($ultimate, array('a', 'e', 'i', 'ä'))) {
        $dropLastLetter();
    }

    // c)
    if (in_array(UTF8::substr($this->r1, -2, 2), array('oj', 'uj'))) {
        $dropLastLetter();
    }

    // d)
    if (UTF8::substr($this->r1, -2, 2) === 'jo') {
        $dropLastLetter();
    }

    // e) walk backwards over the trailing vowels up to the first non-vowel
    $trailingVowels = '';
    $index = UTF8::strlen($this->word) - 1;

    while ($index > 0) {
        $char = UTF8::substr($this->word, $index, 1);

        if (!in_array($char, self::$vowels, true)) {
            // doubled consonant: keep everything before it plus the vowels
            if (UTF8::substr($this->word, $index - 1, 1) === $char) {
                $this->word = UTF8::substr($this->word, 0, $index) . $trailingVowels;
            }
            break;
        }

        $trailingVowels = $char . $trailingVowels;
        $index--;
    }
}
|
||||
}
|
||||
533
libraries/vendor/wamania/php-stemmer/src/Stemmer/French.php
vendored
Normal file
533
libraries/vendor/wamania/php-stemmer/src/Stemmer/French.php
vendored
Normal file
@ -0,0 +1,533 @@
|
||||
<?php
|
||||
|
||||
namespace Wamania\Snowball\Stemmer;
|
||||
|
||||
use voku\helper\UTF8;
|
||||
|
||||
/**
|
||||
*
|
||||
* @link http://snowball.tartarus.org/algorithms/french/stemmer.html
|
||||
* @author wamania
|
||||
*
|
||||
*/
|
||||
class French extends Stem
|
||||
{
|
||||
/**
|
||||
* All french vowels
|
||||
*/
|
||||
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y', 'â', 'à', 'ë', 'é', 'ê', 'è', 'ï', 'î', 'ô', 'û', 'ù');
|
||||
|
||||
/**
 * {@inheritdoc}
 *
 * Stems one French word: validates the encoding, lower-cases the word,
 * marks semi-vowels (step 0), computes RV/R1/R2 and then runs steps 1 to 6
 * followed by finish().
 *
 * @param string $word Word to stem; must be valid UTF-8.
 *
 * @return string The stem.
 *
 * @throws \Exception When $word is not valid UTF-8.
 */
public function stem($word)
{
    // we do ALL in UTF-8
    if (!UTF8::is_utf8($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word = UTF8::strtolower($word);
    $this->plainVowels = implode('', self::$vowels);

    $this->step0();

    $this->rv();
    $this->r1();
    $this->r2();

    // snapshot used to tell whether step 1, 2a or 2b altered the word
    $this->originalWord = $this->word;

    $nextStep = $this->step1();

    // Step 2a runs when step 1 asks for it (returns 2) or left the word
    // unchanged; step 2b runs only when 2a reports no modification.
    $step1LeftWordAlone = ($this->originalWord == $this->word);
    if (($nextStep == 2) || $step1LeftWordAlone) {
        if (!$this->step2a()) {
            $this->step2b();
        }
    }

    // step 4 if the word is still the snapshot, step 3 otherwise
    if ($this->word == $this->originalWord) {
        $this->step4();
    } else {
        $this->step3();
    }

    $this->step5();
    $this->step6();
    $this->finish();

    return $this->word;
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Assume the word is in lower case.
|
||||
* Then put into upper case u or i preceded and followed by a vowel, and y preceded or followed by a vowel.
|
||||
* u after q is also put into upper case. For example,
|
||||
* jouer -> joUer
|
||||
* ennuie -> ennuIe
|
||||
* yeux -> Yeux
|
||||
* quand -> qUand
|
||||
*/
|
||||
private function step0()
|
||||
{
|
||||
$this->word = preg_replace('#([q])u#u', '$1U', $this->word);
|
||||
$this->word = preg_replace('#(['.$this->plainVowels.'])y#u', '$1Y', $this->word);
|
||||
$this->word = preg_replace('#y(['.$this->plainVowels.'])#u', 'Y$1', $this->word);
|
||||
$this->word = preg_replace('#(['.$this->plainVowels.'])u(['.$this->plainVowels.'])#u', '$1U$2', $this->word);
|
||||
$this->word = preg_replace('#(['.$this->plainVowels.'])i(['.$this->plainVowels.'])#u', '$1I$2', $this->word);
|
||||
}
|
||||
|
||||
/**
|
||||
* Step 1
|
||||
* Search for the longest among the following suffixes, and perform the action indicated.
|
||||
*
|
||||
* @return integer Next step number
|
||||
*/
|
||||
private function step1()
|
||||
{
|
||||
// ance iqUe isme able iste eux ances iqUes ismes ables istes
|
||||
// delete if in R2
|
||||
if ( ($position = $this->search(array('ances', 'iqUes', 'ismes', 'ables', 'istes', 'ance', 'iqUe','isme', 'able', 'iste', 'eux'))) !== false) {
|
||||
if ($this->inR2($position)) {
|
||||
$this->word = UTF8::substr($this->word, 0, $position);
|
||||
}
|
||||
return 3;
|
||||
}
|
||||
|
||||
// atrice ateur ation atrices ateurs ations
|
||||
// delete if in R2
|
||||
// if preceded by ic, delete if in R2, else replace by iqU
|
||||
if ( ($position = $this->search(array('atrices', 'ateurs', 'ations', 'atrice', 'ateur', 'ation'))) !== false) {
|
||||
if ($this->inR2($position)) {
|
||||
$this->word = UTF8::substr($this->word, 0, $position);
|
||||
|
||||
if ( ($position2 = $this->searchIfInR2(array('ic'))) !== false) {
|
||||
$this->word = UTF8::substr($this->word, 0, $position2);
|
||||
} else {
|
||||
$this->word = preg_replace('#(ic)$#u', 'iqU', $this->word);
|
||||
}
|
||||
}
|
||||
|
||||
return 3;
|
||||
}
|
||||
|
||||
// logie logies
|
||||
// replace with log if in R2
|
||||
if ( ($position = $this->search(array('logies', 'logie'))) !== false) {
|
||||
if ($this->inR2($position)) {
|
||||
$this->word = preg_replace('#(logies|logie)$#u', 'log', $this->word);
|
||||
}
|
||||
return 3;
|
||||
}
|
||||
|
||||
// usion ution usions utions
|
||||
// replace with u if in R2
|
||||
if ( ($position = $this->search(array('usions', 'utions', 'usion', 'ution'))) !== false) {
|
||||
if ($this->inR2($position)) {
|
||||
$this->word = preg_replace('#(usion|ution|usions|utions)$#u', 'u', $this->word);
|
||||
}
|
||||
return 3;
|
||||
}
|
||||
|
||||
// ence ences
|
||||
// replace with ent if in R2
|
||||
if ( ($position = $this->search(array('ences', 'ence'))) !== false) {
|
||||
if ($this->inR2($position)) {
|
||||
$this->word = preg_replace('#(ence|ences)$#u', 'ent', $this->word);
|
||||
}
|
||||
return 3;
|
||||
}
|
||||
|
||||
// issement issements
|
||||
// delete if in R1 and preceded by a non-vowel
|
||||
if ( ($position = $this->search(array('issements', 'issement'))) != false) {
|
||||
if ($this->inR1($position)) {
|
||||
$before = $position - 1;
|
||||
$letter = UTF8::substr($this->word, $before, 1);
|
||||
if (! in_array($letter, self::$vowels)) {
|
||||
$this->word = UTF8::substr($this->word, 0, $position);
|
||||
}
|
||||
}
|
||||
return 3;
|
||||
}
|
||||
|
||||
// ement ements
|
||||
// delete if in RV
|
||||
// if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
|
||||
// if preceded by eus, delete if in R2, else replace by eux if in R1, otherwise,
|
||||
// if preceded by abl or iqU, delete if in R2, otherwise,
|
||||
// if preceded by ièr or Ièr, replace by i if in RV
|
||||
if ( ($position = $this->search(array('ements', 'ement'))) !== false) {
|
||||
|
||||
// delete if in RV
|
||||
if ($this->inRv($position)) {
|
||||
$this->word = UTF8::substr($this->word, 0, $position);
|
||||
}
|
||||
|
||||
// if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
|
||||
if ( ($position = $this->searchIfInR2(array('iv'))) !== false) {
|
||||
$this->word = UTF8::substr($this->word, 0, $position);
|
||||
if ( ($position2 = $this->searchIfInR2(array('at'))) !== false) {
|
||||
$this->word = UTF8::substr($this->word, 0, $position2);
|
||||
}
|
||||
|
||||
// if preceded by eus, delete if in R2, else replace by eux if in R1, otherwise,
|
||||
} elseif ( ($position = $this->search(array('eus'))) !== false) {
|
||||
if ($this->inR2($position)) {
|
||||
$this->word = UTF8::substr($this->word, 0, $position);
|
||||
|
||||
} elseif ($this->inR1($position)) {
|
||||
$this->word = preg_replace('#(eus)$#u', 'eux', $this->word);
|
||||
}
|
||||
|
||||
// if preceded by abl or iqU, delete if in R2, otherwise,
|
||||
} elseif ( ($position = $this->searchIfInR2(array('abl', 'iqU'))) !== false) {
|
||||
$this->word = UTF8::substr($this->word, 0, $position);
|
||||
|
||||
// if preceded by ièr or Ièr, replace by i if in RV
|
||||
} elseif ( ($position = $this->searchIfInRv(array('ièr', 'Ièr'))) !== false) {
|
||||
$this->word = preg_replace('#(ièr|Ièr)$#u', 'i', $this->word);
|
||||
}
|
||||
return 3;
|
||||
}
|
||||
|
||||
// ité ités
|
||||
// delete if in R2
|
||||
// if preceded by abil, delete if in R2, else replace by abl, otherwise,
|
||||
// if preceded by ic, delete if in R2, else replace by iqU, otherwise,
|
||||
// if preceded by iv, delete if in R2
|
||||
if ( ($position = $this->search(array('ités', 'ité'))) !== false) {
|
||||
|
||||
// delete if in R2
|
||||
if ($this->inR2($position)) {
|
||||
$this->word = UTF8::substr($this->word, 0, $position);
|
||||
}
|
||||
|
||||
// if preceded by abil, delete if in R2, else replace by abl, otherwise,
|
||||
if ( ($position = $this->search(array('abil'))) !== false) {
|
||||
if ($this->inR2($position)) {
|
||||
$this->word = UTF8::substr($this->word, 0, $position);
|
||||
} else {
|
||||
$this->word = preg_replace('#(abil)$#u', 'abl', $this->word);
|
||||
}
|
||||
|
||||
// if preceded by ic, delete if in R2, else replace by iqU, otherwise,
|
||||
} elseif ( ($position = $this->search(array('ic'))) !== false) {
|
||||
if ($this->inR2($position)) {
|
||||
$this->word = UTF8::substr($this->word, 0, $position);
|
||||
} else {
|
||||
$this->word = preg_replace('#(ic)$#u', 'iqU', $this->word);
|
||||
}
|
||||
|
||||
// if preceded by iv, delete if in R2
|
||||
} elseif ( ($position = $this->searchIfInR2(array('iv'))) !== false) {
|
||||
$this->word = UTF8::substr($this->word, 0, $position);
|
||||
}
|
||||
|
||||
return 3;
|
||||
}
|
||||
|
||||
// if ive ifs ives
|
||||
// delete if in R2
|
||||
// if preceded by at, delete if in R2 (and if further preceded by ic, delete if in R2, else replace by iqU)
|
||||
if ( ($position = $this->search(array('ifs', 'ives', 'if', 'ive'))) !== false) {
|
||||
|
||||
if ($this->inR2($position)) {
|
||||
$this->word = UTF8::substr($this->word, 0, $position);
|
||||
}
|
||||
|
||||
if ( ($position = $this->searchIfInR2(array('at'))) !== false) {
|
||||
$this->word = UTF8::substr($this->word, 0, $position);
|
||||
|
||||
if ( ($position2 = $this->search(array('ic'))) !== false) {
|
||||
if ($this->inR2($position2)) {
|
||||
$this->word = UTF8::substr($this->word, 0, $position2);
|
||||
} else {
|
||||
$this->word = preg_replace('#(ic)$#u', 'iqU', $this->word);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 3;
|
||||
}
|
||||
|
||||
// eaux
|
||||
// replace with eau
|
||||
if ( ($position = $this->search(array('eaux'))) !== false) {
|
||||
$this->word = preg_replace('#(eaux)$#u', 'eau', $this->word);
|
||||
return 3;
|
||||
}
|
||||
|
||||
// aux
|
||||
// replace with al if in R1
|
||||
if ( ($position = $this->search(array('aux'))) !== false) {
|
||||
if ($this->inR1($position)) {
|
||||
$this->word = preg_replace('#(aux)$#u', 'al', $this->word);
|
||||
}
|
||||
return 3;
|
||||
}
|
||||
|
||||
// euse euses
|
||||
// delete if in R2, else replace by eux if in R1
|
||||
if ( ($position = $this->search(array('euses', 'euse'))) !== false) {
|
||||
if ($this->inR2($position)) {
|
||||
$this->word = UTF8::substr($this->word, 0, $position);
|
||||
|
||||
} elseif ($this->inR1($position)) {
|
||||
$this->word = preg_replace('#(euses|euse)$#u', 'eux', $this->word);
|
||||
//return 3;
|
||||
}
|
||||
return 3;
|
||||
}
|
||||
|
||||
// amment
|
||||
// replace with ant if in RV
|
||||
if ( ($position = $this->search(array('amment'))) !== false) {
|
||||
if ($this->inRv($position)) {
|
||||
$this->word = preg_replace('#(amment)$#u', 'ant', $this->word);
|
||||
}
|
||||
return 2;
|
||||
}
|
||||
|
||||
// emment
|
||||
// replace with ent if in RV
|
||||
if ( ($position = $this->search(array('emment'))) !== false) {
|
||||
if ($this->inRv($position)) {
|
||||
$this->word = preg_replace('#(emment)$#u', 'ent', $this->word);
|
||||
}
|
||||
return 2;
|
||||
}
|
||||
|
||||
// ment ments
|
||||
// delete if preceded by a vowel in RV
|
||||
if ( ($position = $this->search(array('ments', 'ment'))) != false) {
|
||||
$before = $position - 1;
|
||||
$letter = UTF8::substr($this->word, $before, 1);
|
||||
if ( $this->inRv($before) && (in_array($letter, self::$vowels)) ) {
|
||||
$this->word = UTF8::substr($this->word, 0, $position);
|
||||
}
|
||||
|
||||
return 2;
|
||||
}
|
||||
|
||||
return 2;
|
||||
}
|
||||
|
||||
/**
|
||||
* Step 2a: Verb suffixes beginning i
|
||||
* In steps 2a and 2b all tests are confined to the RV region.
|
||||
* Search for the longest among the following suffixes and if found, delete if preceded by a non-vowel.
|
||||
* îmes ît îtes i ie ies ir ira irai iraIent irais irait iras irent irez iriez
|
||||
* irions irons iront is issaIent issais issait issant issante issantes issants isse
|
||||
* issent isses issez issiez issions issons it
|
||||
* (Note that the non-vowel itself must also be in RV.)
|
||||
*/
|
||||
private function step2a()
|
||||
{
|
||||
if ( ($position = $this->searchIfInRv(array(
|
||||
'îmes', 'îtes', 'ît', 'ies', 'ie', 'iraIent', 'irais', 'irait', 'irai', 'iras', 'ira', 'irent', 'irez', 'iriez',
|
||||
'irions', 'irons', 'iront', 'ir', 'issaIent', 'issais', 'issait', 'issant', 'issantes', 'issante', 'issants',
|
||||
'issent', 'isses', 'issez', 'isse', 'issiez', 'issions', 'issons', 'is', 'it', 'i'))) !== false) {
|
||||
|
||||
$before = $position - 1;
|
||||
$letter = UTF8::substr($this->word, $before, 1);
|
||||
if ( $this->inRv($before) && (!in_array($letter, self::$vowels)) ) {
|
||||
$this->word = UTF8::substr($this->word, 0, $position);
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Do step 2b if step 2a was done, but failed to remove a suffix.
|
||||
* Step 2b: Other verb suffixes
|
||||
*/
|
||||
private function step2b()
|
||||
{
|
||||
// é ée ées és èrent er era erai eraIent erais erait eras erez eriez erions erons eront ez iez
|
||||
// delete
|
||||
if ( ($position = $this->searchIfInRv(array(
|
||||
'ées', 'èrent', 'erais', 'erait', 'erai', 'eraIent', 'eras', 'erez', 'eriez',
|
||||
'erions', 'erons', 'eront', 'era', 'er', 'iez', 'ez','és', 'ée', 'é'))) !== false) {
|
||||
|
||||
$this->word = UTF8::substr($this->word, 0, $position);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// âmes ât âtes a ai aIent ais ait ant ante antes ants as asse assent asses assiez assions
|
||||
// delete
|
||||
// if preceded by e, delete
|
||||
if ( ($position = $this->searchIfInRv(array(
|
||||
'âmes', 'âtes', 'ât', 'aIent', 'ais', 'ait', 'antes', 'ante', 'ants', 'ant',
|
||||
'assent', 'asses', 'assiez', 'assions', 'asse', 'as', 'ai', 'a'))) !== false) {
|
||||
|
||||
$before = $position - 1;
|
||||
$letter = UTF8::substr($this->word, $before, 1);
|
||||
if ( $this->inRv($before) && ($letter == 'e') ) {
|
||||
$this->word = UTF8::substr($this->word, 0, $before);
|
||||
|
||||
} else {
|
||||
$this->word = UTF8::substr($this->word, 0, $position);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// ions
|
||||
// delete if in R2
|
||||
if ( ($position = $this->searchIfInRv(array('ions'))) !== false) {
|
||||
if ($this->inR2($position)) {
|
||||
$this->word = UTF8::substr($this->word, 0, $position);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Step 3: Replace final Y with i or final ç with c
|
||||
*/
|
||||
private function step3()
|
||||
{
|
||||
$this->word = preg_replace('#(Y)$#u', 'i', $this->word);
|
||||
$this->word = preg_replace('#(ç)$#u', 'c', $this->word);
|
||||
}
|
||||
|
||||
/**
|
||||
* Step 4: Residual suffix
|
||||
*/
|
||||
private function step4()
|
||||
{
|
||||
//If the word ends s, not preceded by a, i, o, u, è or s, delete it.
|
||||
if (preg_match('#[^aiouès]s$#', $this->word)) {
|
||||
$this->word = UTF8::substr($this->word, 0, -1);
|
||||
}
|
||||
|
||||
// In the rest of step 4, all tests are confined to the RV region.
|
||||
// ion
|
||||
// delete if in R2 and preceded by s or t
|
||||
if ( (($position = $this->searchIfInRv(array('ion'))) !== false) && ($this->inR2($position)) ) {
|
||||
$before = $position - 1;
|
||||
$letter = UTF8::substr($this->word, $before, 1);
|
||||
if ( $this->inRv($before) && (($letter == 's') || ($letter == 't')) ) {
|
||||
$this->word = UTF8::substr($this->word, 0, $position);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// ier ière Ier Ière
|
||||
// replace with i
|
||||
if ( ($this->searchIfInRv(array('ier', 'ière', 'Ier', 'Ière'))) !== false) {
|
||||
$this->word = preg_replace('#(ier|ière|Ier|Ière)$#u', 'i', $this->word);
|
||||
return true;
|
||||
}
|
||||
|
||||
// e
|
||||
// delete
|
||||
if ( ($this->searchIfInRv(array('e'))) !== false) {
|
||||
$this->word = UTF8::substr($this->word, 0, -1);
|
||||
return true;
|
||||
}
|
||||
|
||||
// ë
|
||||
// if preceded by gu, delete
|
||||
if ( ($position = $this->searchIfInRv(array('guë'))) !== false) {
|
||||
if ($this->inRv($position+2)) {
|
||||
$this->word = UTF8::substr($this->word, 0, -1);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Step 5: Undouble
|
||||
* If the word ends enn, onn, ett, ell or eill, delete the last letter
|
||||
*/
|
||||
private function step5()
|
||||
{
|
||||
if ($this->search(array('enn', 'onn', 'ett', 'ell', 'eill')) !== false) {
|
||||
$this->word = UTF8::substr($this->word, 0, -1);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Step 6: Un-accent
|
||||
* If the words ends é or è followed by at least one non-vowel, remove the accent from the e.
|
||||
*/
|
||||
private function step6()
|
||||
{
|
||||
$this->word = preg_replace('#(é|è)([^'.$this->plainVowels.']+)$#u', 'e$2', $this->word);
|
||||
}
|
||||
|
||||
/**
|
||||
* And finally:
|
||||
* Turn any remaining I, U and Y letters in the word back into lower case.
|
||||
*/
|
||||
private function finish()
|
||||
{
|
||||
$this->word = UTF8::str_replace(array('I','U','Y'), array('i', 'u', 'y'), $this->word);
|
||||
}
|
||||
|
||||
/**
|
||||
* If the word begins with two vowels, RV is the region after the third letter,
|
||||
* otherwise the region after the first vowel not at the beginning of the word,
|
||||
* or the end of the word if these positions cannot be found.
|
||||
* (Exceptionally, par, col or tap, at the begining of a word is also taken to define RV as the region to their right.)
|
||||
*/
|
||||
protected function rv()
|
||||
{
|
||||
$length = UTF8::strlen($this->word);
|
||||
|
||||
$this->rv = '';
|
||||
$this->rvIndex = $length;
|
||||
|
||||
if ($length < 3) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// If the word begins with two vowels, RV is the region after the third letter
|
||||
$first = UTF8::substr($this->word, 0, 1);
|
||||
$second = UTF8::substr($this->word, 1, 1);
|
||||
|
||||
if ( (in_array($first, self::$vowels)) && (in_array($second, self::$vowels)) ) {
|
||||
$this->rv = UTF8::substr($this->word, 3);
|
||||
$this->rvIndex = 3;
|
||||
return true;
|
||||
}
|
||||
|
||||
// (Exceptionally, par, col or tap, at the begining of a word is also taken to define RV as the region to their right.)
|
||||
$begin3 = UTF8::substr($this->word, 0, 3);
|
||||
if (in_array($begin3, array('par', 'col', 'tap'))) {
|
||||
$this->rv = UTF8::substr($this->word, 3);
|
||||
$this->rvIndex = 3;
|
||||
return true;
|
||||
}
|
||||
|
||||
// otherwise the region after the first vowel not at the beginning of the word,
|
||||
for ($i=1; $i<$length; $i++) {
|
||||
$letter = UTF8::substr($this->word, $i, 1);
|
||||
if (in_array($letter, self::$vowels)) {
|
||||
$this->rv = UTF8::substr($this->word, ($i + 1));
|
||||
$this->rvIndex = $i + 1;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
216
libraries/vendor/wamania/php-stemmer/src/Stemmer/German.php
vendored
Normal file
216
libraries/vendor/wamania/php-stemmer/src/Stemmer/German.php
vendored
Normal file
@ -0,0 +1,216 @@
|
||||
<?php
|
||||
|
||||
namespace Wamania\Snowball\Stemmer;
|
||||
|
||||
use voku\helper\UTF8;
|
||||
|
||||
/**
|
||||
*
|
||||
* @link http://snowball.tartarus.org/algorithms/german/stemmer.html
|
||||
* @author wamania
|
||||
*
|
||||
*/
|
||||
class German extends Stem
{
    /**
     * All German vowels
     */
    protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y', 'ä', 'ö', 'ü');

    /**
     * Letters that may precede a removable final s (step 1)
     */
    protected static $sEndings = array('b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 'r', 't');

    /**
     * Letters that may precede a removable final st (step 2)
     */
    protected static $stEndings = array('b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't');

    /**
     * {@inheritdoc}
     *
     * Lower-cases the word, rewrites ß as ss, marks u and y between vowels,
     * computes R1 and R2 and then runs steps 1 to 3 of the German Snowball stemmer.
     *
     * @param string $word UTF-8 encoded word
     *
     * @return string The stem
     *
     * @throws \Exception When $word is not valid UTF-8
     */
    public function stem($word)
    {
        // we do ALL in UTF-8
        if (!UTF8::is_utf8($word)) {
            throw new \Exception('Word must be in UTF-8');
        }

        // vowels as one string, ready to be dropped into a regex character class
        $this->plainVowels = implode('', self::$vowels);

        $this->word = UTF8::strtolower($word);

        // First, replace ß by ss
        $this->word = UTF8::str_replace('ß', 'ss', $this->word);

        // put u and y between vowels into upper case
        $this->word = preg_replace('#(['.$this->plainVowels.'])y(['.$this->plainVowels.'])#u', '$1Y$2', $this->word);
        $this->word = preg_replace('#(['.$this->plainVowels.'])u(['.$this->plainVowels.'])#u', '$1U$2', $this->word);

        // R1 and R2 are first set up in the standard way
        $this->r1();
        $this->r2();

        // but then R1 is adjusted so that the region before it contains at least 3 letters.
        if ($this->r1Index < 3) {
            $this->r1Index = 3;
            $this->r1 = UTF8::substr($this->word, 3);
        }

        $this->step1();
        $this->step2();
        $this->step3();
        $this->finish();

        return $this->word;
    }

    /**
     * Step 1
     * Tries the suffix groups (a) em ern er, (b) e en es and (c) s in that order;
     * the first group found at the end of the word is deleted if it is in R1.
     *
     * @return boolean True when one of the groups matched the end of the word
     */
    private function step1()
    {
        // (a) em   ern   er
        //      delete if in R1
        if (($position = $this->search(array('em', 'ern', 'er'))) !== false) {
            if ($this->inR1($position)) {
                $this->word = UTF8::substr($this->word, 0, $position);
            }

            return true;
        }

        // (b) e   en   es
        //      delete if in R1
        if (($position = $this->search(array('es', 'en', 'e'))) !== false) {
            if ($this->inR1($position)) {
                $this->word = UTF8::substr($this->word, 0, $position);

                // If an ending of group (b) is deleted, and the ending is preceded by niss, delete the final s
                if ($this->search(array('niss')) !== false) {
                    $this->word = UTF8::substr($this->word, 0, -1);
                }
            }

            return true;
        }

        // (c) s (preceded by a valid s-ending)
        //      delete if in R1
        if (($position = $this->search(array('s'))) !== false) {
            if ($this->inR1($position)) {
                $letter = UTF8::substr($this->word, $position - 1, 1);

                if (in_array($letter, self::$sEndings, true)) {
                    $this->word = UTF8::substr($this->word, 0, $position);
                }
            }

            return true;
        }

        return false;
    }

    /**
     * Step 2
     *
     * @return boolean True when one of the groups matched the end of the word
     */
    private function step2()
    {
        // en   er   est
        //      delete if in R1
        if (($position = $this->search(array('en', 'er', 'est'))) !== false) {
            if ($this->inR1($position)) {
                $this->word = UTF8::substr($this->word, 0, $position);
            }

            return true;
        }

        // st (preceded by a valid st-ending, itself preceded by at least 3 letters)
        //      delete if in R1
        if (($position = $this->search(array('st'))) !== false) {
            if ($this->inR1($position)) {
                // index of the st-ending letter; it must have at least 3 letters in front of it
                $before = $position - 1;
                if ($before >= 3) {
                    $letter = UTF8::substr($this->word, $before, 1);

                    if (in_array($letter, self::$stEndings, true)) {
                        $this->word = UTF8::substr($this->word, 0, $position);
                    }
                }
            }

            return true;
        }

        return false;
    }

    /**
     * Step 3: d-suffixes
     *
     * @return boolean True when one of the groups matched the end of the word
     */
    private function step3()
    {
        // end   ung
        //      delete if in R2
        //      if preceded by ig, delete if in R2 and not preceded by e
        if (($position = $this->search(array('end', 'ung'))) !== false) {
            if ($this->inR2($position)) {
                $this->word = UTF8::substr($this->word, 0, $position);
            }

            if (($position2 = $this->search(array('ig'))) !== false) {
                $letter = UTF8::substr($this->word, $position2 - 1, 1);

                if ($this->inR2($position2) && ($letter !== 'e')) {
                    $this->word = UTF8::substr($this->word, 0, $position2);
                }
            }

            return true;
        }

        // ig   ik   isch
        //      delete if in R2 and not preceded by e
        if (($position = $this->search(array('ig', 'ik', 'isch'))) !== false) {
            $letter = UTF8::substr($this->word, $position - 1, 1);

            if ($this->inR2($position) && ($letter !== 'e')) {
                $this->word = UTF8::substr($this->word, 0, $position);
            }

            return true;
        }

        // lich   heit
        //      delete if in R2
        //      if preceded by er or en, delete if in R1
        if (($position = $this->search(array('lich', 'heit'))) !== false) {
            if ($this->inR2($position)) {
                $this->word = UTF8::substr($this->word, 0, $position);
            }

            if (($position2 = $this->search(array('er', 'en'))) !== false) {
                if ($this->inR1($position2)) {
                    $this->word = UTF8::substr($this->word, 0, $position2);
                }
            }

            return true;
        }

        // keit
        //      delete if in R2
        //      if preceded by lich or ig, delete if in R2
        if (($position = $this->search(array('keit'))) !== false) {
            if ($this->inR2($position)) {
                $this->word = UTF8::substr($this->word, 0, $position);
            }

            if (($position2 = $this->search(array('lich', 'ig'))) !== false) {
                if ($this->inR2($position2)) {
                    $this->word = UTF8::substr($this->word, 0, $position2);
                }
            }

            return true;
        }

        return false;
    }

    /**
     * Finally
     * Turn U and Y back into lower case, and remove the umlaut accent from a, o and u.
     */
    private function finish()
    {
        $this->word = UTF8::str_replace(array('U', 'Y', 'ä', 'ü', 'ö'), array('u', 'y', 'a', 'u', 'o'), $this->word);
    }
}
|
||||
289
libraries/vendor/wamania/php-stemmer/src/Stemmer/Italian.php
vendored
Normal file
289
libraries/vendor/wamania/php-stemmer/src/Stemmer/Italian.php
vendored
Normal file
@ -0,0 +1,289 @@
|
||||
<?php
|
||||
|
||||
namespace Wamania\Snowball\Stemmer;
|
||||
|
||||
use voku\helper\UTF8;
|
||||
|
||||
/**
|
||||
*
|
||||
* @link http://snowball.tartarus.org/algorithms/italian/stemmer.html
|
||||
* @author wamania
|
||||
*
|
||||
*/
|
||||
class Italian extends Stem
|
||||
{
|
||||
/**
|
||||
* All Italian vowels
|
||||
*/
|
||||
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'à', 'è', 'ì', 'ò', 'ù');
|
||||
|
||||
/**
|
||||
* {@inheritdoc}
|
||||
*/
|
||||
public function stem($word)
{
    // Runs the Italian Snowball pipeline on one UTF-8 word and returns its stem.
    // Throws \Exception when the input is not valid UTF-8.
    if (!UTF8::is_utf8($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    // vowels as one string, ready to be dropped into a regex character class
    $this->plainVowels = implode('', self::$vowels);

    $this->word = UTF8::strtolower($word);

    // First, replace all acute accents by grave accents.
    $this->word = UTF8::str_replace(array('á', 'é', 'í', 'ó', 'ú'), array('à', 'è', 'ì', 'ò', 'ù'), $this->word);

    // And, as in French, put u after q, and u, i between vowels into upper case
    // so that they are not treated as vowels by the later steps.
    $this->word = preg_replace('#([q])u#u', '$1U', $this->word);
    $this->word = preg_replace('#(['.$this->plainVowels.'])u(['.$this->plainVowels.'])#u', '$1U$2', $this->word);
    $this->word = preg_replace('#(['.$this->plainVowels.'])i(['.$this->plainVowels.'])#u', '$1I$2', $this->word);

    $this->rv();
    $this->r1();
    $this->r2();

    $this->step0();

    // snapshot taken after step 0 so that only step 1's effect is detected
    $beforeStep1 = $this->word;
    $this->step1();

    // Do step 2 if no ending was removed by step 1.
    if ($beforeStep1 === $this->word) {
        $this->step2();
    }

    $this->step3a();
    $this->step3b();
    $this->finish();

    return $this->word;
}
|
||||
|
||||
/**
|
||||
* Step 0: Attached pronoun
|
||||
*/
|
||||
private function step0()
{
    // Handles an attached pronoun at the end of the word.
    // Returns true when a pronoun suffix was found (whether or not the word
    // was changed), false otherwise.

    // Search for the longest among the following suffixes
    $position = $this->search(array(
        'gliela', 'gliele', 'glieli', 'glielo', 'gliene',
        'sene', 'mela', 'mele', 'meli', 'melo', 'mene', 'tela', 'tele', 'teli', 'telo', 'tene', 'cela',
        'cele', 'celi', 'celo', 'cene', 'vela', 'vele', 'veli', 'velo', 'vene',
        'gli', 'la', 'le', 'li', 'lo', 'mi', 'ne', 'si', 'ti', 'vi', 'ci'));

    if ($position === false) {
        return false;
    }

    $suffixe = UTF8::substr($this->word, $position);
    $withoutSuffixe = UTF8::substr($this->word, 0, $position);

    // The pronoun only counts when it follows one of (in RV)
    //  (a) ando endo
    //  (b) ar er ir
    $a = array('ando' . $suffixe, 'endo' . $suffixe);
    $b = array('ar' . $suffixe, 'er' . $suffixe, 'ir' . $suffixe);

    if ($this->searchIfInRv($a) !== false) {
        // In case of (a) the suffix is deleted
        $this->word = $withoutSuffixe;
    } elseif ($this->searchIfInRv($b) !== false) {
        // In case (b) it is replaced by e.
        // The match position is already known, so no regex is built from the suffix.
        $this->word = $withoutSuffixe . 'e';
    }

    return true;
}
|
||||
|
||||
/**
|
||||
* Step 1: Standard suffix removal
|
||||
*/
|
||||
private function step1()
{
    // Standard suffix removal. Each suffix group is tried in turn; the first
    // group found at the end of the word decides the outcome.
    // Returns true when a group matched (whether or not anything was removed),
    // false when none did.

    // amente
    //      delete if in R1
    //      if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
    //      if preceded by os, ic or abil, delete if in R2
    if (($position = $this->search(array('amente'))) !== false) {
        if ($this->inR1($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
        if (($position2 = $this->searchIfInR2(array('iv'))) !== false) {
            $this->word = UTF8::substr($this->word, 0, $position2);
            if (($position3 = $this->searchIfInR2(array('at'))) !== false) {
                $this->word = UTF8::substr($this->word, 0, $position3);
            }

        // if preceded by os, ic or abil, delete if in R2
        } elseif (($position4 = $this->searchIfInR2(array('os', 'ic', 'abil'))) !== false) {
            $this->word = UTF8::substr($this->word, 0, $position4);
        }

        return true;
    }

    // delete if in R2
    if (($position = $this->search(array(
        'ibili', 'atrice', 'abili', 'abile', 'ibile', 'atrici', 'mente',
        'anza', 'anze', 'iche', 'ichi', 'ismo', 'ismi', 'ista', 'iste', 'isti', 'istà', 'istè', 'istì', 'ante', 'anti',
        'ico', 'ici', 'ica', 'ice', 'oso', 'osi', 'osa', 'ose'
    ))) !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        return true;
    }

    // azione   azioni   atore   atori
    //      delete if in R2
    //      if preceded by ic, delete if in R2
    if (($position = $this->search(array('azione', 'azioni', 'atore', 'atori'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);

            if (($position2 = $this->search(array('ic'))) !== false) {
                if ($this->inR2($position2)) {
                    $this->word = UTF8::substr($this->word, 0, $position2);
                }
            }
        }

        return true;
    }

    // logia   logie
    //      replace with log if in R2
    if (($position = $this->search(array('logia', 'logie'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = preg_replace('#(logia|logie)$#u', 'log', $this->word);
        }

        return true;
    }

    // uzione   uzioni   usione   usioni
    //      replace with u if in R2
    if (($position = $this->search(array('uzione', 'uzioni', 'usione', 'usioni'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = preg_replace('#(uzione|uzioni|usione|usioni)$#u', 'u', $this->word);
        }

        return true;
    }

    // enza   enze
    //      replace with ente if in R2
    if (($position = $this->search(array('enza', 'enze'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = preg_replace('#(enza|enze)$#u', 'ente', $this->word);
        }

        return true;
    }

    // amento   amenti   imento   imenti
    //      delete if in RV
    if (($position = $this->search(array('amento', 'amenti', 'imento', 'imenti'))) !== false) {
        if ($this->inRv($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        return true;
    }

    // ità
    //      delete if in R2
    //      if preceded by abil, ic or iv, delete if in R2
    if (($position = $this->search(array('ità'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        if (($position2 = $this->searchIfInR2(array('abil', 'ic', 'iv'))) !== false) {
            $this->word = UTF8::substr($this->word, 0, $position2);
        }

        return true;
    }

    // ivo   ivi   iva   ive
    //      delete if in R2
    //      if preceded by at, delete if in R2 (and if further preceded by ic, delete if in R2)
    if (($position = $this->search(array('ivo', 'ivi', 'iva', 'ive'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        if (($position2 = $this->searchIfInR2(array('at'))) !== false) {
            $this->word = UTF8::substr($this->word, 0, $position2);
            if (($position3 = $this->searchIfInR2(array('ic'))) !== false) {
                $this->word = UTF8::substr($this->word, 0, $position3);
            }
        }

        return true;
    }

    return false;
}
|
||||
|
||||
/**
|
||||
* Step 2: Verb suffixes
|
||||
* Search for the longest among the following suffixes in RV, and if found, delete.
|
||||
*/
|
||||
private function step2()
{
    // Italian verb endings. The list is kept in its original order;
    // NOTE(review): lookup presumably honours list order (longer endings first) — verify against Stem::search.
    $verbEndings = array(
        'assimo', 'assero', 'eranno', 'erebbero', 'erebbe', 'eremmo', 'ereste', 'eresti', 'essero', 'iranno', 'irebbero', 'irebbe', 'iremmo',
        'iscano', 'ireste', 'iresti', 'iscono', 'issero',
        'avamo', 'arono', 'avano', 'avate', 'eremo', 'erete', 'erono', 'evamo', 'evano', 'evate', 'ivamo', 'ivano', 'ivate', 'iremo', 'irete', 'irono',
        'ammo', 'ando', 'asse', 'assi', 'emmo', 'enda', 'ende', 'endi', 'endo', 'erai', 'erei', 'Yamo', 'iamo', 'immo', 'irà', 'irai', 'irei',
        'isca', 'isce', 'isci', 'isco',
        'ano', 'are', 'ata', 'ate', 'ati', 'ato', 'ava', 'avi', 'avo', 'erà', 'ere', 'erò', 'ete', 'eva',
        'evi', 'evo', 'ire', 'ita', 'ite', 'iti', 'ito', 'iva', 'ivi', 'ivo', 'ono', 'uta', 'ute', 'uti', 'uto', 'irò', 'ar', 'ir',
    );

    $cutAt = $this->searchIfInRv($verbEndings);
    if ($cutAt === false) {
        // no verb ending found inside RV: leave the word untouched
        return;
    }

    // strip the ending by keeping everything before it
    $this->word = UTF8::substr($this->word, 0, $cutAt);
}
|
||||
|
||||
/**
|
||||
* Step 3a
|
||||
* Delete a final a, e, i, o, à, è, ì or ò if it is in RV, and a preceding i if it is in RV
|
||||
*/
|
||||
private function step3a()
{
    // Returns true when a final vowel was removed, false when the word was left alone.
    $finalVowels = array('a', 'e', 'i', 'o', 'à', 'è', 'ì', 'ò');

    if ($this->searchIfInRv($finalVowels) === false) {
        return false;
    }

    // drop the final vowel
    $this->word = UTF8::substr($this->word, 0, -1);

    // the word may now end in an "i" that is also inside RV: drop that too
    $endsWithI = ($this->searchIfInRv(array('i')) !== false);
    if ($endsWithI) {
        $this->word = UTF8::substr($this->word, 0, -1);
    }

    return true;
}
|
||||
|
||||
/**
|
||||
* Step 3b
|
||||
* Replace final ch (or gh) with c (or g) if in RV (crocch -> crocc)
|
||||
*/
|
||||
private function step3b()
{
    // digraph => array(end-anchored pattern, replacement); "ch" is tried before "gh"
    // and at most one rewrite is applied.
    $rewrites = array(
        'ch' => array('#(ch)$#u', 'c'),
        'gh' => array('#(gh)$#u', 'g'),
    );

    foreach ($rewrites as $digraph => $rewrite) {
        if ($this->searchIfInRv(array($digraph)) === false) {
            continue;
        }

        $this->word = preg_replace($rewrite[0], $rewrite[1], $this->word);

        return;
    }
}
|
||||
|
||||
/**
|
||||
* Finally
|
||||
* turn I and U back into lower case
|
||||
*/
|
||||
private function finish()
{
    // Upper-case I and U are markers inside the working word; map each back to its lower-case letter.
    $markers = array('I', 'U');
    $letters = array('i', 'u');

    $this->word = UTF8::str_replace($markers, $letters, $this->word);
}
|
||||
}
|
||||
130
libraries/vendor/wamania/php-stemmer/src/Stemmer/Norwegian.php
vendored
Normal file
130
libraries/vendor/wamania/php-stemmer/src/Stemmer/Norwegian.php
vendored
Normal file
@ -0,0 +1,130 @@
|
||||
<?php
|
||||
|
||||
namespace Wamania\Snowball\Stemmer;
|
||||
|
||||
use voku\helper\UTF8;
|
||||
|
||||
/**
|
||||
*
|
||||
* @link http://snowball.tartarus.org/algorithms/norwegian/stemmer.html
|
||||
* @author wamania
|
||||
*
|
||||
*/
|
||||
class Norwegian extends Stem
|
||||
{
|
||||
/**
|
||||
* All norwegian vowels
|
||||
*/
|
||||
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y', 'æ', 'å', 'ø');
|
||||
|
||||
/**
|
||||
* {@inheritdoc}
|
||||
*/
|
||||
public function stem($word)
{
    // Everything below works on UTF-8 text only; anything else is rejected up front.
    if (!UTF8::is_utf8($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word = UTF8::strtolower($word);

    // Only the R1 region is computed here; R2 is not used by this stemmer.
    $this->r1();

    // R1 is pushed right so that the region before it holds at least this many letters.
    $minimumPrefix = 3;
    if ($this->r1Index < $minimumPrefix) {
        $this->r1Index = $minimumPrefix;
        $this->r1 = UTF8::substr($this->word, $minimumPrefix);
    }

    // Run steps 1, 2 and 3 in order; each one edits $this->word in place.
    $this->step1();
    $this->step2();
    $this->step3();

    return $this->word;
}
|
||||
|
||||
/**
|
||||
* Define a valid s-ending as one of
|
||||
* b c d f g h j l m n o p r t v y z,
|
||||
* or k not preceded by a vowel
|
||||
*
|
||||
* @param string $word
|
||||
* @return boolean
|
||||
*/
|
||||
private function hasValidSEnding($word)
{
    // Decide from the last letter of $word (and, for "k", the letter before it).
    $last = UTF8::substr($word, -1, 1);

    if ($last == 'k') {
        // "k" only qualifies when the letter in front of it is not a vowel
        $previous = UTF8::substr($word, -2, 1);

        return !in_array($previous, self::$vowels);
    }

    // every other letter qualifies only if it is in this fixed set
    $alwaysValid = array('b', 'c', 'd', 'f', 'g', 'h', 'j', 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y', 'z');

    return in_array($last, $alwaysValid);
}
|
||||
|
||||
/**
|
||||
* Step 1
|
||||
* Search for the longest among the following suffixes in R1, and perform the action indicated.
|
||||
*/
|
||||
private function step1()
{
    // Tries three suffix groups in turn, all restricted to R1, and stops at the first group that matches.
    // Returns true when a group matched (even if the word ended up unchanged), false otherwise.

    // erte ert
    //      replace with er
    if ($this->searchIfInR1(array('erte', 'ert')) !== false) {
        $this->word = preg_replace('#(erte|ert)$#u', 'er', $this->word);

        return true;
    }

    // a e ede ande ende ane ene hetene en heten ar er heter as es edes endes enes hetenes ens hetens ers ets et het ast
    //      delete
    if ( ($position = $this->searchIfInR1(array(
        'hetenes', 'hetene', 'hetens', 'heten', 'endes', 'heter', 'ande', 'ende', 'enes', 'edes', 'ede', 'ane',
        'ene', 'het', 'ers', 'ets', 'ast', 'ens', 'en', 'ar', 'er', 'as', 'es', 'et', 'a', 'e'
    ))) !== false) {
        $this->word = UTF8::substr($this->word, 0, $position);

        return true;
    }

    // s
    //      delete if preceded by a valid s-ending
    if ( ($position = $this->searchIfInR1(array('s'))) !== false) {
        // candidate stem: the word without its final "s"
        $word = UTF8::substr($this->word, 0, $position);

        if ($this->hasValidSEnding($word)) {
            $this->word = $word;
        }

        return true;
    }

    // nothing matched: report it explicitly instead of falling off the end with null
    return false;
}
|
||||
|
||||
/**
|
||||
* Step 2
|
||||
* If the word ends dt or vt in R1, delete the t.
|
||||
*/
|
||||
private function step2()
{
    // Only the trailing "t" of a "dt" / "vt" ending found in R1 is removed.
    $endsWithDtOrVt = ($this->searchIfInR1(array('dt', 'vt')) !== false);

    if (!$endsWithDtOrVt) {
        return;
    }

    $this->word = UTF8::substr($this->word, 0, -1);
}
|
||||
|
||||
/**
|
||||
* Step 3:
|
||||
* Search for the longest among the following suffixes in R1, and if found, delete.
|
||||
*/
|
||||
private function step3()
{
    // leg eleg ig eig lig elig els lov elov slov hetslov (order kept exactly as before)
    $suffixes = array(
        'hetslov', 'eleg', 'elov', 'slov', 'elig', 'eig', 'lig', 'els', 'lov', 'leg', 'ig',
    );

    $cutAt = $this->searchIfInR1($suffixes);
    if ($cutAt === false) {
        return;
    }

    // delete the suffix by keeping what precedes it
    $this->word = UTF8::substr($this->word, 0, $cutAt);
}
|
||||
}
|
||||
283
libraries/vendor/wamania/php-stemmer/src/Stemmer/Portuguese.php
vendored
Normal file
283
libraries/vendor/wamania/php-stemmer/src/Stemmer/Portuguese.php
vendored
Normal file
@ -0,0 +1,283 @@
|
||||
<?php
|
||||
|
||||
namespace Wamania\Snowball\Stemmer;
|
||||
|
||||
use voku\helper\UTF8;
|
||||
|
||||
/**
|
||||
*
|
||||
* @link http://snowball.tartarus.org/algorithms/portuguese/stemmer.html
|
||||
* @author wamania
|
||||
*
|
||||
*/
|
||||
class Portuguese extends Stem
|
||||
{
|
||||
/**
|
||||
* All Portuguese vowels
|
||||
*/
|
||||
protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'á', 'é', 'í', 'ó', 'ú', 'â', 'ê', 'ô');
|
||||
|
||||
/**
|
||||
* {@inheritdoc}
|
||||
*/
|
||||
public function stem($word)
{
    // Everything below works on UTF-8 text only; anything else is rejected up front.
    if (!UTF8::is_utf8($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    // Lower-case, then spell the nasal vowels as two-character placeholders (undone in finish()).
    $lowered = UTF8::strtolower($word);
    $this->word = UTF8::str_replace(array('ã', 'õ'), array('a~', 'o~'), $lowered);

    $this->rv();
    $this->r1();
    $this->r2();

    // Snapshot used to detect whether step 1 / step 2 changed the word.
    $initial = $this->word;

    $this->step1();
    $untouched = ($initial == $this->word);

    if ($untouched) {
        // step 1 changed nothing, so give step 2 a chance
        $this->step2();
        $untouched = ($initial == $this->word);
    }

    if ($untouched) {
        // neither step 1 nor step 2 altered the word
        $this->step4();
    } else {
        $this->step3();
    }

    $this->step5();
    $this->finish();

    return $this->word;
}
|
||||
|
||||
/**
|
||||
* Step 1: Standard suffix removal
|
||||
*/
|
||||
private function step1()
{
    // Suffix groups are tried in order. As soon as one group's suffix is found at the end of the word,
    // its action is applied (subject to the region test) and the method returns true.
    // Returns false when no group matched.

    // delete if in R2
    if ( ($position = $this->search(array(
        'amentos', 'imentos', 'adoras', 'adores', 'amento', 'imento', 'adora', 'istas', 'ismos', 'antes', 'ância',
        'ezas', 'eza', 'icos', 'icas', 'ismo', 'ável', 'ível', 'ista', 'oso',
        'osos', 'osas', 'osa', 'ico', 'ica', 'ador', 'aça~o', 'aço~es' , 'ante'))) !== false) {

        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        return true;
    }

    // logía logías
    //      replace with log if in R2
    // NOTE(review): "logía"/"ución" below are Spanish spellings; newer Snowball releases reportedly
    // use Portuguese forms here. Changing them alters produced stems, so confirm before touching.
    if ( ($position = $this->search(array('logías', 'logía'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = preg_replace('#(logías|logía)$#u', 'log', $this->word);
        }

        return true;
    }

    // ución uciones
    //      replace with u if in R2
    if ( ($position = $this->search(array('uciones', 'ución'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = preg_replace('#(uciones|ución)$#u', 'u', $this->word);
        }

        return true;
    }

    // ência ências
    //      replace with ente if in R2
    if ( ($position = $this->search(array('ências', 'ência'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = preg_replace('#(ências|ência)$#u', 'ente', $this->word);
        }

        return true;
    }

    // amente
    //      delete if in R1
    //      if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
    //      if preceded by os, ic or ad, delete if in R2
    if ( ($position = $this->search(array('amente'))) !== false) {

        // delete if in R1
        if ($this->inR1($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
        if ( ($position2 = $this->searchIfInR2(array('iv'))) !== false) {
            $this->word = UTF8::substr($this->word, 0, $position2);

            if ( ($position3 = $this->searchIfInR2(array('at'))) !== false) {
                $this->word = UTF8::substr($this->word, 0, $position3);
            }

        // if preceded by os, ic or ad, delete if in R2
        } elseif ( ($position4 = $this->searchIfInR2(array('os', 'ic', 'ad'))) !== false) {
            $this->word = UTF8::substr($this->word, 0, $position4);
        }

        return true;
    }

    // mente
    //      delete if in R2
    //      if preceded by ante, avel or ível, delete if in R2
    if ( ($position = $this->search(array('mente'))) !== false) {

        // delete if in R2
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        // if preceded by ante, avel or ível, delete if in R2
        // (strict comparison: a position is an integer, only boolean false means "not found")
        if ( ($position2 = $this->searchIfInR2(array('ante', 'avel', 'ível'))) !== false) {
            $this->word = UTF8::substr($this->word, 0, $position2);
        }

        return true;
    }

    // idade idades
    //      delete if in R2
    //      if preceded by abil, ic or iv, delete if in R2
    if ( ($position = $this->search(array('idades', 'idade'))) !== false) {

        // delete if in R2
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        // if preceded by abil, ic or iv, delete if in R2
        if ( ($position2 = $this->searchIfInR2(array('abil', 'ic', 'iv'))) !== false) {
            $this->word = UTF8::substr($this->word, 0, $position2);
        }

        return true;
    }

    // iva ivo ivas ivos
    //      delete if in R2
    //      if preceded by at, delete if in R2
    if ( ($position = $this->search(array('ivas', 'ivos', 'iva', 'ivo'))) !== false) {

        // delete if in R2
        if ($this->inR2($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        // if preceded by at, delete if in R2
        if ( ($position2 = $this->searchIfInR2(array('at'))) !== false) {
            $this->word = UTF8::substr($this->word, 0, $position2);
        }

        return true;
    }

    // ira iras
    //      replace with ir if in RV and preceded by e
    if ( ($position = $this->search(array('iras', 'ira'))) !== false) {

        if ($this->inRv($position)) {
            // letter immediately in front of the suffix
            $before = $position - 1;
            $letter = UTF8::substr($this->word, $before, 1);

            if ($letter == 'e') {
                $this->word = preg_replace('#(iras|ira)$#u', 'ir', $this->word);
            }
        }

        return true;
    }

    return false;
}
|
||||
|
||||
/**
|
||||
* Step 2: Verb suffixes
|
||||
* Search for the longest among the following suffixes in RV, and if found, delete.
|
||||
*/
|
||||
private function step2()
{
    // Portuguese verb endings, in their original order.
    // Returns true when an ending inside RV was removed, false otherwise.
    $verbEndings = array(
        'aríamos', 'eríamos', 'iríamos', 'ássemos', 'êssemos', 'íssemos',
        'aríeis', 'eríeis', 'iríeis', 'ásseis', 'ésseis', 'ísseis', 'áramos', 'éramos', 'íramos', 'ávamos',
        'aremos', 'eremos', 'iremos',
        'ariam', 'eriam', 'iriam', 'assem', 'essem', 'issem', 'arias', 'erias', 'irias', 'ardes', 'erdes', 'irdes',
        'asses', 'esses', 'isses', 'astes', 'estes', 'istes', 'áreis', 'areis', 'éreis', 'ereis', 'íreis', 'ireis',
        'áveis', 'íamos', 'armos', 'ermos', 'irmos',
        'aria', 'eria', 'iria', 'asse', 'esse', 'isse', 'aste', 'este', 'iste', 'arei', 'erei', 'irei', 'adas', 'idas',
        'aram', 'eram', 'iram', 'avam', 'arem', 'erem', 'irem', 'ando', 'endo', 'indo', 'ara~o', 'era~o', 'ira~o',
        'arás', 'aras', 'erás', 'eras', 'irás', 'avas', 'ares', 'eres', 'ires', 'íeis', 'ados', 'idos', 'ámos', 'amos',
        'emos', 'imos', 'iras',
        'ada', 'ida', 'ará', 'ara', 'erá', 'era', 'irá', 'ava', 'iam', 'ado', 'ido', 'ias', 'ais', 'eis', 'ira',
        'ia', 'ei', 'am', 'em', 'ar', 'er', 'ir', 'as', 'es', 'is', 'eu', 'iu', 'ou',
    );

    $cutAt = $this->searchIfInRv($verbEndings);
    if ($cutAt === false) {
        return false;
    }

    $this->word = UTF8::substr($this->word, 0, $cutAt);

    return true;
}
|
||||
|
||||
/**
|
||||
* Step 3: d-suffixes
|
||||
*
|
||||
*/
|
||||
private function step3()
{
    // Handles a final "i" inside RV: it is deleted only when the letter before it is "c".
    // Returns true whenever such a final "i" was found (deleted or not), false otherwise.
    if ($this->searchIfInRv(array('i')) === false) {
        return false;
    }

    $preceding = UTF8::substr($this->word, -2, 1);
    if ($preceding == 'c') {
        $this->word = UTF8::substr($this->word, 0, -1);
    }

    return true;
}
|
||||
|
||||
/**
|
||||
* Step 4
|
||||
*/
|
||||
private function step4()
{
    // Residual suffixes "os a i o á í ó": delete the one found inside RV.
    // Returns true when something was removed, false otherwise.
    $residualSuffixes = array('os', 'a', 'i', 'o','á', 'í', 'ó');

    $cutAt = $this->searchIfInRv($residualSuffixes);
    if ($cutAt === false) {
        return false;
    }

    $this->word = UTF8::substr($this->word, 0, $cutAt);

    return true;
}
|
||||
|
||||
/**
|
||||
* Step 5
|
||||
*/
|
||||
private function step5()
{
    // Case 1: the word ends with "e", "é" or "ê" inside RV.
    if ($this->searchIfInRv(array('e', 'é', 'ê')) !== false) {
        // delete that final vowel
        $this->word = UTF8::substr($this->word, 0, -1);

        // if the word now ends in "gu" or "ci" and the second letter of that pair is in RV,
        // delete that second letter as well
        $pairAt = $this->search(array('gu', 'ci'));
        if ($pairAt !== false && $this->inRv(($pairAt + 1))) {
            $this->word = UTF8::substr($this->word, 0, -1);
        }

        return true;
    }

    // Case 2: the word ends with "ç" — replace it with a plain "c".
    if ($this->search(array('ç')) === false) {
        return false;
    }

    $this->word = preg_replace('#(ç)$#u', 'c', $this->word);

    return true;
}
|
||||
|
||||
/**
|
||||
* Finally
|
||||
*/
|
||||
private function finish()
{
    // Turn the two-character placeholders "a~" and "o~" back into the nasal vowels "ã" and "õ".
    $placeholders = array('a~', 'o~');
    $nasalVowels = array('ã', 'õ');

    $this->word = UTF8::str_replace($placeholders, $nasalVowels, $this->word);
}
|
||||
}
|
||||
334
libraries/vendor/wamania/php-stemmer/src/Stemmer/Romanian.php
vendored
Normal file
334
libraries/vendor/wamania/php-stemmer/src/Stemmer/Romanian.php
vendored
Normal file
@ -0,0 +1,334 @@
|
||||
<?php
|
||||
|
||||
namespace Wamania\Snowball\Stemmer;
|
||||
|
||||
use voku\helper\UTF8;
|
||||
|
||||
/**
|
||||
*
|
||||
* @link http://snowball.tartarus.org/algorithms/romanian/stemmer.html
|
||||
* @author wamania
|
||||
*
|
||||
*/
|
||||
class Romanian extends Stem
|
||||
{
|
||||
/**
|
||||
* All Romanian vowels
|
||||
*/
|
||||
protected static $vowels = array('a', 'ă', 'â', 'e', 'i', 'î', 'o', 'u');
|
||||
|
||||
/**
|
||||
* {@inheritdoc}
|
||||
*/
|
||||
public function stem($word)
{
    // Everything below works on UTF-8 text only; anything else is rejected up front.
    if (!UTF8::is_utf8($word)) {
        throw new \Exception('Word must be in UTF-8');
    }

    $this->word = UTF8::strtolower($word);
    $this->plainVowels = implode('', self::$vowels);

    // A "u" and then an "i" standing between two vowels are upper-cased (so that they are
    // treated as consonants); finish() lower-cases them again.
    $vowelGroup = '([' . $this->plainVowels . '])';
    foreach (array('u' => 'U', 'i' => 'I') as $lower => $upper) {
        $this->word = preg_replace('#' . $vowelGroup . $lower . $vowelGroup . '#u', '$1' . $upper . '$2', $this->word);
    }

    $this->rv();
    $this->r1();
    $this->r2();

    $this->step0();

    // Snapshot taken after step 0: used later to tell whether step 1 or step 2 changed anything.
    $afterStep0 = $this->word;

    // Repeat step 1 until a pass leaves the word unchanged.
    while (true) {
        $beforePass = $this->word;
        $this->step1();

        if ($this->word == $beforePass) {
            break;
        }
    }

    $this->step2();

    // Step 3 runs only if neither step 1 nor step 2 altered the word.
    if ($afterStep0 == $this->word) {
        $this->step3();
    }

    $this->step4();
    $this->finish();

    return $this->word;
}
|
||||
|
||||
/**
|
||||
* Step 0: Removal of plurals (and other simplifications)
|
||||
* Search for the longest among the following suffixes, and, if it is in R1, perform the action indicated.
|
||||
* @return boolean
|
||||
*/
|
||||
private function step0()
{
    // Each rule: array(suffixes to look for, replacement text, forbidden two-letter prefix or null).
    // Rules are tried top to bottom; the first rule whose suffix ends the word decides the outcome.
    // The replacement is applied only if the suffix lies in R1 (and the forbidden prefix, if any,
    // does not sit immediately in front of the suffix).
    $rules = array(
        // ul ului: delete
        array(array('ul', 'ului'), '', null),
        // aua: replace with a
        array(array('aua'), 'a', null),
        // ea ele elor: replace with e
        array(array('ea', 'ele', 'elor'), 'e', null),
        // ii iua iei iile iilor ilor: replace with i
        array(array('ii', 'iua', 'iei', 'iile', 'iilor', 'ilor'), 'i', null),
        // ile: replace with i if not preceded by ab
        array(array('ile'), 'i', 'ab'),
        // atei: replace with at
        array(array('atei'), 'at', null),
        // aţie aţia: replace with aţi
        array(array('aţie', 'aţia'), 'aţi', null),
    );

    foreach ($rules as $rule) {
        list($suffixes, $replacement, $forbiddenPrefix) = $rule;

        // strict check: a position is an integer, only boolean false means "not found"
        $position = $this->search($suffixes);
        if ($position === false) {
            continue;
        }

        if ($this->inR1($position)) {
            $blocked = ($forbiddenPrefix !== null)
                && (UTF8::substr($this->word, ($position - 2), 2) === $forbiddenPrefix);

            if (!$blocked) {
                // keep everything before the suffix, then append the replacement
                $this->word = UTF8::substr($this->word, 0, $position) . $replacement;
            }
        }

        // a suffix was recognised, whether or not the word changed
        return true;
    }

    return false;
}
|
||||
|
||||
/**
|
||||
* Step 1: Reduction of combining suffixes
|
||||
* Search for the longest among the following suffixes, and, if it is in R1, perform the replacement action indicated.
|
||||
* Then repeat this step until no replacement occurs.
|
||||
* @return boolean
|
||||
*/
|
||||
private function step1()
{
    // Each rule: array(suffixes to look for, replacement text).
    // Rules are tried top to bottom; the first rule whose suffix ends the word decides the outcome.
    //
    // The replacement is built from the position search() returned, not from a second regular
    // expression that repeats the suffix list. The duplicated list used to say "cator" where
    // "icator" was meant, which turned e.g. "indicator" into "indiic" instead of "indic".
    $rules = array(
        // abilitate abilitati abilităi abilităţi: replace with abil
        array(array('abilitate', 'abilitati', 'abilităi', 'abilităţi'), 'abil'),
        // ibilitate: replace with ibil
        array(array('ibilitate'), 'ibil'),
        // ivitate ivitati ivităi ivităţi: replace with iv
        array(array('ivitate', 'ivitati', 'ivităi', 'ivităţi'), 'iv'),
        // icitate icitati icităi icităţi icator icatori iciv iciva icive icivi icivă ical icala icale icali icală:
        // replace with ic
        array(array(
            'icitate', 'icitati', 'icităi', 'icităţi', 'icatori', 'icator', 'iciva',
            'icive', 'icivi', 'icivă', 'icala', 'icale', 'icali', 'icală', 'iciv', 'ical'), 'ic'),
        // ativ ativa ative ativi ativă aţiune atoare ator atori ătoare ător ători: replace with at
        array(array('ativa', 'ative', 'ativi', 'ativă', 'ativ', 'aţiune', 'atoare', 'atori', 'ătoare', 'ători', 'ător', 'ator'), 'at'),
        // itiv itiva itive itivi itivă iţiune itoare itor itori: replace with it
        array(array('itiva', 'itive', 'itivi', 'itivă', 'itiv', 'iţiune', 'itoare', 'itori', 'itor'), 'it'),
    );

    foreach ($rules as $rule) {
        list($suffixes, $replacement) = $rule;

        $position = $this->search($suffixes);
        if ($position === false) {
            continue;
        }

        if ($this->inR1($position)) {
            // keep everything before the suffix, then append the replacement
            $this->word = UTF8::substr($this->word, 0, $position) . $replacement;
        }

        // a suffix was recognised, whether or not the word changed
        return true;
    }

    return false;
}
|
||||
|
||||
/**
|
||||
* Step 2: Removal of 'standard' suffixes
|
||||
* Search for the longest among the following suffixes, and, if it is in R2, perform the action indicated.
|
||||
* @return boolean
|
||||
*/
|
||||
private function step2()
{
    // Three suffix groups are tried in order; the first group found at the end of the word wins
    // and the method returns true even when the R2 test then leaves the word unchanged.

    // Group 1 — plain deletion.
    $deletable = array(
        'atori', 'itate', 'itati', 'ităţi', 'abila', 'abile', 'abili', 'abilă', 'ibila', 'ibile', 'ibili', 'ibilă',
        'anta', 'ante', 'anti', 'antă', 'ator', 'ibil', 'oasa', 'oasă', 'oase', 'ităi', 'abil',
        'osi', 'oşi', 'ant', 'ici', 'ică', 'iva', 'ive', 'ivi', 'ivă', 'ata', 'ată', 'ati', 'ate', 'ata', 'ată',
        'ati', 'ate', 'uta', 'ută', 'uti', 'ute', 'ita', 'ită', 'iti', 'ite', 'ica', 'ice',
        'at', 'os', 'iv', 'ut', 'it', 'ic'
    );
    $found = $this->search($deletable);
    if ($found !== false) {
        if ($this->inR2($found)) {
            $this->word = UTF8::substr($this->word, 0, $found);
        }

        return true;
    }

    // Group 2 — iune iuni: delete if preceded by ţ, and replace that ţ by t.
    $found = $this->search(array('iune', 'iuni'));
    if ($found !== false) {
        if ($this->inR2($found)) {
            $previousLetter = UTF8::substr($this->word, $found - 1, 1);

            if ($previousLetter == 'ţ') {
                $this->word = UTF8::substr($this->word, 0, $found);
                $this->word = preg_replace('#(ţ)$#u', 't', $this->word);
            }
        }

        return true;
    }

    // Group 3 — ism isme ist ista iste isti istă işti: replace with ist.
    $found = $this->search(array('isme', 'ism', 'ista', 'iste', 'isti', 'istă', 'işti', 'ist'));
    if ($found === false) {
        return false;
    }

    if ($this->inR2($found)) {
        $this->word = preg_replace('#(isme|ism|ista|iste|isti|istă|işti|ist)$#u', 'ist', $this->word);
    }

    return true;
}
|
||||
|
||||
/**
|
||||
* Step 3: Removal of verb suffixes
|
||||
* Do step 3 if no suffix was removed either by step 1 or step 2.
|
||||
* @return boolean
|
||||
*/
|
||||
private function step3()
{
    // Two groups of verb suffixes, both searched inside RV. The first group found decides the
    // outcome. Returns true when a suffix was recognised (deleted or not), false otherwise.

    // are ere ire âre ind ând indu ându eze ească ez ezi ează esc eşti
    // eşte ăsc ăşti ăşte am ai au eam eai ea eaţi eau iam iai ia iaţi
    // iau ui aşi arăm arăţi ară uşi urăm urăţi ură işi irăm irăţi iră âi
    // âşi ârăm ârăţi âră asem aseşi ase aserăm aserăţi aseră isem iseşi ise
    // iserăm iserăţi iseră âsem âseşi âse âserăm âserăţi âseră usem useşi use userăm userăţi useră
    //      delete if preceded in RV by a consonant or u
    if ( ($position = $this->searchIfInRv(array(
        'userăţi', 'iserăţi', 'âserăţi', 'aserăţi',
        'userăm', 'iserăm', 'âserăm', 'aserăm',
        'iseră', 'âseşi', 'useră', 'âseră', 'useşi', 'iseşi', 'aseră', 'aseşi', 'ârăţi', 'irăţi', 'urăţi', 'arăţi', 'ească',
        'usem', 'âsem', 'isem', 'asem', 'ârăm', 'urăm', 'irăm', 'arăm', 'iaţi', 'eaţi', 'ăşte', 'ăşti', 'eşte', 'eşti', 'ează', 'ându', 'indu',
        'âse', 'use', 'ise', 'ase', 'âră', 'iră', 'işi', 'ură', 'uşi', 'ară', 'aşi', 'âşi', 'iau', 'iai', 'iam', 'eau', 'eai', 'eam', 'ăsc',
        'are', 'ere', 'ire', 'âre', 'ind', 'ând', 'eze', 'ezi', 'esc',
        'âi', 'ui', 'ia', 'ea', 'au', 'ai', 'am', 'ez'
    ))) !== false) {
        if ($this->inRv($position)) {
            // the letter in front of the suffix must itself lie in RV
            $before = $position - 1;
            if ($this->inRv($before)) {
                $letter = UTF8::substr($this->word, $before, 1);

                // consonant (anything not listed as a vowel) or the vowel "u"
                if ( (!in_array($letter, self::$vowels)) || ($letter == 'u') ) {
                    $this->word = UTF8::substr($this->word, 0, $position);
                }
            }
        }

        return true;
    }

    // ăm aţi em eţi im iţi âm âţi seşi serăm serăţi seră sei se sesem seseşi sese seserăm seserăţi seseră
    //      delete
    if ( ($position = $this->searchIfInRv(array(
        'seserăm', 'seserăţi', 'seseră', 'seseşi', 'sesem', 'serăţi', 'serăm', 'seşi', 'sese', 'seră',
        'aţi', 'eţi', 'iţi', 'âţi', 'sei', 'se', 'ăm', 'âm', 'em', 'im'
    ))) !== false) {
        if ($this->inRv($position)) {
            $this->word = UTF8::substr($this->word, 0, $position);
        }

        return true;
    }

    // nothing matched: return the documented boolean instead of falling off the end with null
    return false;
}
|
||||
|
||||
/**
|
||||
* Step 4: Removal of final vowel
|
||||
*/
|
||||
private function step4()
{
    // Final-vowel suffixes "a e i ie ă": delete the one found if it lies in RV.
    // Always returns true, whether or not anything was removed.
    $finalVowelSuffixes = array('a', 'ie', 'e', 'i', 'ă');

    $found = $this->search($finalVowelSuffixes);
    $removable = ($found !== false) && $this->inRv($found);

    if ($removable) {
        $this->word = UTF8::substr($this->word, 0, $found);
    }

    return true;
}
|
||||
|
||||
/**
|
||||
* Finally
|
||||
* Turn I, U back into i, u
|
||||
*/
|
||||
private function finish()
{
    // Upper-case I and U are markers inside the working word; map each back to its lower-case letter.
    $markers = array('I', 'U');
    $letters = array('i', 'u');

    $this->word = UTF8::str_replace($markers, $letters, $this->word);
}
|
||||
}
|
||||
252
libraries/vendor/wamania/php-stemmer/src/Stemmer/Russian.php
vendored
Normal file
252
libraries/vendor/wamania/php-stemmer/src/Stemmer/Russian.php
vendored
Normal file
@ -0,0 +1,252 @@
|
||||
<?php
|
||||
|
||||
namespace Wamania\Snowball\Stemmer;
|
||||
|
||||
use voku\helper\UTF8;
|
||||
|
||||
/**
|
||||
*
|
||||
* @link http://snowball.tartarus.org/algorithms/russian/stemmer.html
|
||||
* @author wamania
|
||||
*
|
||||
*/
|
||||
class Russian extends Stem
|
||||
{
|
||||
/**
|
||||
* All russian vowels
|
||||
*/
|
||||
protected static $vowels = array('а', 'е', 'и', 'о', 'у', 'ы', 'э', 'ю', 'я');
|
||||
|
||||
protected static $perfectiveGerund = array(
|
||||
array('вшись', 'вши', 'в'),
|
||||
array('ывшись', 'ившись', 'ывши', 'ивши', 'ив', 'ыв')
|
||||
);
|
||||
|
||||
protected static $adjective = array(
|
||||
'ыми', 'ими', 'ему', 'ому', 'его', 'ого', 'ее', 'ие', 'ые', 'ое', 'ей', 'ий',
|
||||
'ый', 'ой', 'ем', 'им', 'ым','ом','их', 'ых', 'ую', 'юю', 'ая', 'яя', 'ою', 'ею'
|
||||
);
|
||||
|
||||
protected static $participle = array(
|
||||
array('ем', 'нн', 'вш', 'ющ', 'щ'),
|
||||
array('ивш', 'ывш', 'ующ')
|
||||
);
|
||||
|
||||
protected static $reflexive = array('ся', 'сь');
|
||||
|
||||
protected static $verb = array(
|
||||
array('ешь', 'нно', 'ете', 'йте', 'ла', 'на', 'ли', 'й', 'л', 'ем', 'н', 'ло', 'но', 'ет', 'ют', 'ны', 'ть'),
|
||||
array(
|
||||
'уйте', 'ило', 'ыло', 'ено','ила', 'ыла', 'ена', 'ейте', 'ены', 'ить', 'ыть', 'ишь', 'ите', 'или', 'ыли',
|
||||
'ует', 'уют', 'ей', 'уй', 'ил', 'ыл', 'им', 'ым', 'ен', 'ят', 'ит', 'ыт', 'ую', 'ю'
|
||||
)
|
||||
);
|
||||
|
||||
protected static $noun = array(
|
||||
'иями', 'ями', 'ами', 'ией', 'иям', 'ием', 'иях', 'ев', 'ов', 'ие', 'ье', 'еи', 'ии', 'ей', 'ой', 'ий', 'ям',
|
||||
'ем', 'ам', 'ом', 'ах', 'ях', 'ию', 'ью', 'ия', 'ья', 'я', 'а', 'е', 'ы', 'ь', 'и', 'о', 'у', 'й', 'ю'
|
||||
);
|
||||
|
||||
protected static $superlative = array('ейше', 'ейш');
|
||||
|
||||
protected static $derivational = array('ость', 'ост');
|
||||
|
||||
/**
|
||||
* {@inheritdoc}
|
||||
*/
|
||||
public function stem($word)
{
    // Everything below works on UTF-8 text only; anything else is rejected up front.
    $isUtf8 = UTF8::is_utf8($word);
    if (!$isUtf8) {
        throw new \Exception('Word must be in UTF-8');
    }

    $lowered = UTF8::strtolower($word);
    $this->word = $lowered;

    // Compute the R1, R2 and RV regions before running any step.
    $this->r1();
    $this->r2();
    $this->rv();

    // Run steps 1 to 4 in order; each one edits $this->word in place.
    $this->step1();
    $this->step2();
    $this->step3();
    $this->step4();

    return $this->word;
}
|
||||
|
||||
/**
 * Step 1.
 *
 * Look for a PERFECTIVE GERUND ending; when one is removed the step is over.
 * Otherwise strip a REFLEXIVE ending if present, then try, in this order, an ADJECTIVAL,
 * a VERB and a NOUN ending, stopping at the first one that is removed.
 *
 * @return boolean true when an ending was removed (a removed REFLEXIVE ending alone does not count)
 */
private function step1()
{
    // PERFECTIVE GERUND, group 1: only removed when checkGroup1() accepts the preceding letter.
    $position = $this->searchIfInRv(self::$perfectiveGerund[0]);
    if ($position !== false && $this->inRv($position) && $this->checkGroup1($position)) {
        $this->word = UTF8::substr($this->word, 0, $position);

        return true;
    }

    // PERFECTIVE GERUND, group 2: removed unconditionally.
    $position = $this->searchIfInRv(self::$perfectiveGerund[1]);
    if ($position !== false && $this->inRv($position)) {
        $this->word = UTF8::substr($this->word, 0, $position);

        return true;
    }

    // REFLEXIVE ending: removed, but the search for the other endings carries on.
    $position = $this->searchIfInRv(self::$reflexive);
    if ($position !== false && $this->inRv($position)) {
        $this->word = UTF8::substr($this->word, 0, $position);
    }

    // ADJECTIVAL = ADJECTIVE ending, optionally preceded by a PARTICIPLE ending.
    $position = $this->searchIfInRv(self::$adjective);
    if ($position !== false && $this->inRv($position)) {
        $this->word = UTF8::substr($this->word, 0, $position);

        $participle = $this->search(self::$participle[0]);
        if ($participle !== false && $this->inRv($participle) && $this->checkGroup1($participle)) {
            $this->word = UTF8::substr($this->word, 0, $participle);
        } else {
            $participle = $this->search(self::$participle[1]);
            if ($participle !== false && $this->inRv($participle)) {
                $this->word = UTF8::substr($this->word, 0, $participle);
            }
        }

        // The adjective ending was removed whether or not a participle preceded it.
        return true;
    }

    // VERB group 1 (guarded by checkGroup1), VERB group 2, then NOUN.
    $rules = array(
        array(self::$verb[0], true),
        array(self::$verb[1], false),
        array(self::$noun, false),
    );

    foreach ($rules as $rule) {
        list($endings, $isGroup1) = $rule;

        $position = $this->searchIfInRv($endings);
        if ($position === false || !$this->inRv($position)) {
            continue;
        }
        if ($isGroup1 && !$this->checkGroup1($position)) {
            continue;
        }

        $this->word = UTF8::substr($this->word, 0, $position);

        return true;
    }

    return false;
}
|
||||
|
||||
/**
 * Step 2: drop a trailing и (i) when it lies in RV.
 *
 * @return boolean true when the letter was removed
 */
private function step2()
{
    $position = $this->searchIfInRv(array('и'));

    if ($position === false || !$this->inRv($position)) {
        return false;
    }

    $this->word = UTF8::substr($this->word, 0, $position);

    return true;
}
|
||||
|
||||
/**
 * Step 3: Search for a DERIVATIONAL ending in R2 (i.e. the entire ending must lie in R2),
 * and if one is found, remove it.
 *
 * @return boolean true when an ending was removed, false otherwise
 */
private function step3()
{
    $position = $this->searchIfInRv(self::$derivational);

    // The ending only counts when it starts inside R2.
    if ($position !== false && $this->inR2($position)) {
        $this->word = UTF8::substr($this->word, 0, $position);

        return true;
    }

    // Explicit "nothing removed" result, matching step1() and step2().
    return false;
}
|
||||
|
||||
/**
 * Step 4: (1) Undouble н (n), or, (2) if the word ends with a SUPERLATIVE ending, remove it
 * and undouble н (n), or (3) if the word ends ь (') (soft sign) remove it.
 *
 * The three actions are alternatives: once a superlative ending has been removed only the
 * undoubling may follow, never the soft-sign removal.
 *
 * @return boolean true when the word was modified, false otherwise
 */
private function step4()
{
    // (2) if the word ends with a SUPERLATIVE ending, remove it
    $position = $this->searchIfInRv(self::$superlative);
    $superlativeRemoved = ($position !== false);
    if ($superlativeRemoved) {
        $this->word = UTF8::substr($this->word, 0, $position);
    }

    // (1) Undouble н (n): keep the first н, drop the second
    $position = $this->searchIfInRv(array('нн'));
    if ($position !== false) {
        $this->word = UTF8::substr($this->word, 0, ($position + 1));

        return true;
    }

    // Alternative (2) is finished here; do not fall through to alternative (3).
    if ($superlativeRemoved) {
        return true;
    }

    // (3) if the word ends ь (') (soft sign) remove it
    $position = $this->searchIfInRv(array('ь'));
    if ($position !== false) {
        $this->word = UTF8::substr($this->word, 0, $position);

        return true;
    }

    return false;
}
|
||||
|
||||
/**
 * Computes RV: the region after the first vowel, or the (empty) end of the word when the
 * word contains no vowel. Stores the region in $this->rv and its start in $this->rvIndex.
 *
 * @return boolean true when a vowel was found, false otherwise
 */
protected function rv()
{
    $wordLength = UTF8::strlen($this->word);

    // Defaults for a word without any vowel.
    $this->rvIndex = $wordLength;
    $this->rv = '';

    $cursor = 0;
    while ($cursor < $wordLength) {
        $isVowel = in_array(UTF8::substr($this->word, $cursor, 1), self::$vowels);
        $cursor++;

        if ($isVowel) {
            // $cursor now points at the character right after the vowel.
            $this->rv = UTF8::substr($this->word, $cursor);
            $this->rvIndex = $cursor;

            return true;
        }
    }

    return false;
}
|
||||
|
||||
/**
 * Group 1 endings must follow а (a) or я (ia), and that preceding letter must itself lie in RV.
 *
 * @param integer $position start of the candidate ending within the word
 * @return boolean true when the letter just before $position is in RV and is а or я
 */
private function checkGroup1($position)
{
    $previous = $position - 1;

    if ($this->inRv($previous)) {
        $letter = UTF8::substr($this->word, $previous, 1);

        return ($letter == 'а' || $letter == 'я');
    }

    return false;
}
|
||||
}
|
||||
348
libraries/vendor/wamania/php-stemmer/src/Stemmer/Spanish.php
vendored
Normal file
348
libraries/vendor/wamania/php-stemmer/src/Stemmer/Spanish.php
vendored
Normal file
@ -0,0 +1,348 @@
|
||||
<?php
|
||||
|
||||
namespace Wamania\Snowball\Stemmer;
|
||||
|
||||
use voku\helper\UTF8;
|
||||
|
||||
/**
|
||||
*
|
||||
* @link http://snowball.tartarus.org/algorithms/spanish/stemmer.html
|
||||
* @author wamania
|
||||
*
|
||||
*/
|
||||
class Spanish extends Stem
{
    /**
     * All spanish vowels
     */
    protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'á', 'é', 'í', 'ó', 'ú', 'ü');

    /**
     * {@inheritdoc}
     *
     * @throws \Exception when $word is not valid UTF-8
     */
    public function stem($word)
    {
        // we do ALL in UTF-8
        if (!UTF8::is_utf8($word)) {
            throw new \Exception('Word must be in UTF-8');
        }

        $this->word = UTF8::strtolower($word);

        $this->rv();
        $this->r1();
        $this->r2();

        $this->step0();

        // Snapshot taken after step 0, used to detect whether step 1 / step 2a removed anything.
        $beforeStep1 = $this->word;
        $this->step1();

        // Do step 2a if no ending was removed by step 1.
        if ($this->word === $beforeStep1) {
            $this->step2a();

            // Do Step 2b if step 2a was done, but failed to remove a suffix.
            if ($this->word === $beforeStep1) {
                $this->step2b();
            }
        }

        $this->step3();
        $this->finish();

        return $this->word;
    }

    /**
     * Truncates the current word so that it ends just before $position.
     *
     * @param integer $position index of the first character to drop
     */
    private function cutAt($position)
    {
        $this->word = UTF8::substr($this->word, 0, $position);
    }

    /**
     * Step 0: Attached pronoun
     *
     * Search for the longest among the following suffixes
     *      me   se   sela   selo   selas   selos   la   le   lo   las   les   los   nos
     *
     * and delete it, if comes after one of
     *      (a) iéndo   ándo   ár   ér   ír
     *      (b) ando   iendo   ar   er   ir
     *      (c) yendo following u
     *
     * in RV. In the case of (c), yendo must lie in RV, but the preceding u can be outside it.
     * In the case of (a), deletion is followed by removing the acute accent (for example, haciéndola -> haciendo).
     *
     * @return boolean true when a pronoun was removed, false otherwise
     */
    private function step0()
    {
        $position = $this->searchIfInRv(array(
            'selas', 'selos', 'las', 'los', 'les', 'nos', 'selo', 'sela', 'me', 'se', 'la', 'le', 'lo',
        ));
        if ($position === false) {
            return false;
        }

        $pronoun = UTF8::substr($this->word, $position);
        $withPronoun = function ($ending) use ($pronoun) {
            return $ending . $pronoun;
        };

        // (a) accented gerund / infinitive: strip the accent, then drop the pronoun
        $position2 = $this->searchIfInRv(array_map($withPronoun, array('iéndo', 'ándo', 'ár', 'ér', 'ír')));
        if ($position2 !== false) {
            $tail = UTF8::substr($this->word, $position2);
            $tail = UTF8::to_utf8(UTF8::to_ascii($tail)); // unaccent
            $this->word = UTF8::substr($this->word, 0, $position2) . $tail;
            // $position computed above is reused unchanged after the tail has been unaccented
            $this->cutAt($position);

            return true;
        }

        // (b) unaccented gerund / infinitive
        if ($this->searchIfInRv(array_map($withPronoun, array('iendo', 'ando', 'ar', 'er', 'ir'))) !== false) {
            $this->cutAt($position);

            return true;
        }

        // (c) yendo, which must follow a u (the u itself may be outside RV)
        $position2 = $this->searchIfInRv(array('yendo' . $pronoun));
        if ($position2 !== false && UTF8::substr($this->word, ($position2 - 1), 1) === 'u') {
            $this->cutAt($position);

            return true;
        }

        return false;
    }

    /**
     * Step 1: standard suffix removal.
     *
     * The suffix groups are tried in a fixed order; the first group with a suffix that ends the
     * word is the only one acted upon.
     *
     * @return boolean true when a suffix of one of the groups ends the word (even if it was not
     *                 in the required region and therefore not removed), false otherwise
     */
    private function step1()
    {
        // anza   anzas   ico   ica   icos   icas   ismo   ismos   able   ables   ible   ibles   ista
        // istas   oso   osa   osos   osas   amiento   amientos   imiento   imientos
        //      delete if in R2
        $position = $this->search(array(
            'imientos', 'imiento', 'amientos', 'amiento', 'osas', 'osos', 'osa', 'oso', 'istas', 'ista', 'ibles',
            'ible', 'ables', 'able', 'ismos', 'ismo', 'icas', 'icos', 'ica', 'ico', 'anzas', 'anza',
        ));
        if ($position !== false) {
            if ($this->inR2($position)) {
                $this->cutAt($position);
            }

            return true;
        }

        // adora   ador   ación   adoras   adores   aciones   ante   antes   ancia   ancias
        //      delete if in R2
        //      if preceded by ic, delete if in R2
        $position = $this->search(array(
            'adoras', 'adora', 'aciones', 'ación', 'adores', 'ador', 'antes', 'ante', 'ancias', 'ancia',
        ));
        if ($position !== false) {
            if ($this->inR2($position)) {
                $this->cutAt($position);
            }

            $position2 = $this->searchIfInR2(array('ic'));
            if ($position2 !== false) {
                $this->cutAt($position2);
            }

            return true;
        }

        // logía   logías
        //      replace with log if in R2
        $position = $this->search(array('logías', 'logía'));
        if ($position !== false) {
            if ($this->inR2($position)) {
                $this->word = preg_replace('#(logías|logía)$#u', 'log', $this->word);
            }

            return true;
        }

        // ución   uciones
        //      replace with u if in R2
        $position = $this->search(array('uciones', 'ución'));
        if ($position !== false) {
            if ($this->inR2($position)) {
                $this->word = preg_replace('#(uciones|ución)$#u', 'u', $this->word);
            }

            return true;
        }

        // encia   encias
        //      replace with ente if in R2
        $position = $this->search(array('encias', 'encia'));
        if ($position !== false) {
            if ($this->inR2($position)) {
                $this->word = preg_replace('#(encias|encia)$#u', 'ente', $this->word);
            }

            return true;
        }

        // amente
        //      delete if in R1
        //      if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
        //      if preceded by os, ic or ad, delete if in R2
        $position = $this->search(array('amente'));
        if ($position !== false) {
            if ($this->inR1($position)) {
                $this->cutAt($position);
            }

            $position2 = $this->searchIfInR2(array('iv'));
            if ($position2 !== false) {
                $this->cutAt($position2);

                $position3 = $this->searchIfInR2(array('at'));
                if ($position3 !== false) {
                    $this->cutAt($position3);
                }
            } else {
                $position4 = $this->searchIfInR2(array('os', 'ic', 'ad'));
                if ($position4 !== false) {
                    $this->cutAt($position4);
                }
            }

            return true;
        }

        // mente
        //      delete if in R2
        //      if preceded by ante, able or ible, delete if in R2
        $position = $this->search(array('mente'));
        if ($position !== false) {
            if ($this->inR2($position)) {
                $this->cutAt($position);
            }

            $position2 = $this->searchIfInR2(array('ante', 'able', 'ible'));
            if ($position2 !== false) {
                $this->cutAt($position2);
            }

            return true;
        }

        // idad   idades
        //      delete if in R2
        //      if preceded by abil, ic or iv, delete if in R2
        $position = $this->search(array('idades', 'idad'));
        if ($position !== false) {
            if ($this->inR2($position)) {
                $this->cutAt($position);
            }

            $position2 = $this->searchIfInR2(array('abil', 'ic', 'iv'));
            if ($position2 !== false) {
                $this->cutAt($position2);
            }

            return true;
        }

        // iva   ivo   ivas   ivos
        //      delete if in R2
        //      if preceded by at, delete if in R2
        $position = $this->search(array('ivas', 'ivos', 'iva', 'ivo'));
        if ($position !== false) {
            if ($this->inR2($position)) {
                $this->cutAt($position);
            }

            $position2 = $this->searchIfInR2(array('at'));
            if ($position2 !== false) {
                $this->cutAt($position2);
            }

            return true;
        }

        return false;
    }

    /**
     * Step 2a: Verb suffixes beginning y
     *
     * @return boolean true when a suffix was removed, false otherwise
     */
    private function step2a()
    {
        // if found, delete if preceded by u
        // (Note that the preceding u need not be in RV.)
        $position = $this->searchIfInRv(array(
            'yamos', 'yendo', 'yeron', 'yan', 'yen', 'yais', 'yas', 'yes', 'yo', 'yó', 'ya', 'ye',
        ));

        if ($position !== false && UTF8::substr($this->word, ($position - 1), 1) === 'u') {
            $this->cutAt($position);

            return true;
        }

        return false;
    }

    /**
     * Step 2b: Other verb suffixes
     * Search for the longest among the following suffixes in RV, and perform the action indicated.
     *
     * search() returns the first listed suffix that ends the word, so every suffix must be
     * listed before any shorter suffix it contains (e.g. 'íamos' before 'amos').
     *
     * @return boolean true when a suffix was removed, false otherwise
     */
    private function step2b()
    {
        // delete
        $position = $this->searchIfInRv(array(
            'iésemos', 'iéramos', 'ábamos', 'iríamos', 'eríamos', 'aríamos', 'áramos', 'ásemos', 'eríais',
            'aremos', 'eremos', 'iremos', 'asteis', 'ieseis', 'ierais', 'isteis', 'aríais',
            'irían', 'aréis', 'erían', 'erías', 'eréis', 'iréis', 'irías', 'ieran', 'iesen', 'ieron', 'iendo', 'ieras',
            'iríais', 'arían', 'arías',
            // endings of the reference algorithm that used to be missing from this list
            'ieses', 'abais', 'arais', 'aseis', 'íamos',
            'amos', 'imos', 'ados', 'idos', 'irán', 'irás', 'erán', 'erás', 'ería', 'iría', 'íais', 'arán', 'arás', 'aría',
            'iera', 'iese', 'aste', 'iste', 'aban', 'aran', 'asen', 'aron', 'ando', 'abas', 'adas', 'idas', 'ases', 'aras',
            'aré', 'erá', 'eré', 'áis', 'ías', 'irá', 'iré', 'aba', 'ían', 'ada', 'ara', 'ase', 'ida', 'ado', 'ido', 'ará',
            'ad', 'ed', 'id', 'ís', 'ió', 'ar', 'er', 'ir', 'as', 'ía', 'an',
        ));
        if ($position !== false) {
            $this->cutAt($position);

            return true;
        }

        // en   es   éis   emos
        //      delete, and if preceded by gu delete the u (the gu need not be in RV)
        $position = $this->searchIfInRv(array('éis', 'emos', 'en', 'es'));
        if ($position !== false) {
            $this->cutAt($position);

            $position2 = $this->search(array('gu'));
            if ($position2 !== false) {
                // keep the g, drop the u
                $this->cutAt($position2 + 1);
            }

            return true;
        }

        return false;
    }

    /**
     * Step 3: residual suffix
     * Search for the longest among the following suffixes in RV, and perform the action indicated.
     *
     * @return boolean true when a suffix was removed, false otherwise
     */
    private function step3()
    {
        // os   a   o   á   í   ó
        //      delete if in RV
        $position = $this->searchIfInRv(array('os', 'a', 'o', 'á', 'í', 'ó'));
        if ($position !== false) {
            $this->cutAt($position);

            return true;
        }

        // e   é
        //      delete if in RV, and if preceded by gu with the u in RV delete the u
        $position = $this->searchIfInRv(array('e', 'é'));
        if ($position !== false) {
            $this->cutAt($position);

            $position2 = $this->searchIfInRv(array('u'));
            if ($position2 !== false && UTF8::substr($this->word, ($position2 - 1), 1) === 'g') {
                $this->cutAt($position2);
            }

            // the e / é was removed whether or not a gu preceded it
            return true;
        }

        return false;
    }

    /**
     * And finally:
     * Remove acute accents
     */
    private function finish()
    {
        $this->word = UTF8::str_replace(array('á', 'í', 'ó', 'é', 'ú'), array('a', 'i', 'o', 'e', 'u'), $this->word);
    }
}
|
||||
221
libraries/vendor/wamania/php-stemmer/src/Stemmer/Stem.php
vendored
Normal file
221
libraries/vendor/wamania/php-stemmer/src/Stemmer/Stem.php
vendored
Normal file
@ -0,0 +1,221 @@
|
||||
<?php
|
||||
|
||||
namespace Wamania\Snowball\Stemmer;
|
||||
|
||||
use voku\helper\UTF8;
|
||||
|
||||
abstract class Stem implements Stemmer
{
    /**
     * Default vowel set; language classes redeclare it and it is read through late static
     * binding (static::$vowels) by rx() and rv().
     */
    protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y');

    /**
     * helper, contains stringified list of vowels
     * @var string
     */
    protected $plainVowels;

    /**
     * The word we are stemming
     * @var string
     */
    protected $word;

    /**
     * The original word, use to check if word has been modified
     * @var string
     */
    protected $originalWord;

    /**
     * RV value
     * @var string
     */
    protected $rv;

    /**
     * RV index (based on the beginning of the word)
     * @var integer
     */
    protected $rvIndex;

    /**
     * R1 value (the text of the region, as returned by rx())
     * @var string
     */
    protected $r1;

    /**
     * R1 index (based on the beginning of the word)
     * @var int
     */
    protected $r1Index;

    /**
     * R2 value (the text of the region, as returned by rx())
     * @var string
     */
    protected $r2;

    /**
     * R2 index (based on the beginning of the word)
     * @var int
     */
    protected $r2Index;

    /**
     * @param integer $position
     * @return boolean true when $position is at or after the start of RV
     */
    protected function inRv($position)
    {
        return ($position >= $this->rvIndex);
    }

    /**
     * @param integer $position
     * @return boolean true when $position is at or after the start of R1
     */
    protected function inR1($position)
    {
        return ($position >= $this->r1Index);
    }

    /**
     * @param integer $position
     * @return boolean true when $position is at or after the start of R2
     */
    protected function inR2($position)
    {
        return ($position >= $this->r2Index);
    }

    /**
     * Same as search(), restricted to suffixes starting at or after the RV index.
     */
    protected function searchIfInRv($suffixes)
    {
        return $this->search($suffixes, $this->rvIndex);
    }

    /**
     * Same as search(), restricted to suffixes starting at or after the R1 index.
     */
    protected function searchIfInR1($suffixes)
    {
        return $this->search($suffixes, $this->r1Index);
    }

    /**
     * Same as search(), restricted to suffixes starting at or after the R2 index.
     */
    protected function searchIfInR2($suffixes)
    {
        return $this->search($suffixes, $this->r2Index);
    }

    /**
     * Finds the first suffix of the list that ends the current word.
     *
     * The list is scanned in order and the first hit wins, so callers must list a suffix before
     * any shorter suffix it contains when they want the longest match.
     *
     * @param array   $suffixes candidate suffixes
     * @param integer $offset   start index handed to UTF8::strrpos()
     * @return integer|false position of the matching suffix in the word, or false when none matches
     */
    protected function search($suffixes, $offset = 0)
    {
        $length = UTF8::strlen($this->word);
        if ($offset > $length) {
            return false;
        }

        foreach ($suffixes as $suffixe) {
            $position = UTF8::strrpos($this->word, $suffixe, $offset);

            // the occurrence only counts when it runs up to the very end of the word
            if ($position !== false && (UTF8::strlen($suffixe) + $position) === $length) {
                return $position;
            }
        }

        return false;
    }

    /**
     * R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel.
     */
    protected function r1()
    {
        list($this->r1Index, $this->r1) = $this->rx($this->word);
    }

    /**
     * R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel.
     */
    protected function r2()
    {
        list($index, $value) = $this->rx($this->r1);

        $this->r2 = $value;
        // rx() works relative to R1, so shift the index back to word coordinates
        $this->r2Index = $this->r1Index + $index;
    }

    /**
     * Common function for R1 and R2
     * Search the region after the first non-vowel following a vowel in $in, or the end of the word if there is no such non-vowel.
     * R1 : $in = $this->word
     * R2 : $in = R1
     *
     * @param string $in
     * @return array array(index of the region within $in, text of the region)
     */
    protected function rx($in)
    {
        $length = UTF8::strlen($in);

        $previousIsVowel = false;
        for ($i = 0; $i < $length; $i++) {
            $isVowel = in_array(UTF8::substr($in, $i, 1), static::$vowels, true);

            if ($previousIsVowel && !$isVowel) {
                // the region starts right after this non-vowel
                return array($i + 1, UTF8::substr($in, $i + 1));
            }

            $previousIsVowel = $isVowel;
        }

        // No vowel is followed by a real non-vowel character (the end of the string does not
        // count as one): the region is empty and starts at the end of $in.
        return array($length, '');
    }

    /**
     * Used by spanish, italian, portuguese, etc (but not by french)
     *
     * If the second letter is a consonant, RV is the region after the next following vowel,
     * or if the first two letters are vowels, RV is the region after the next consonant,
     * and otherwise (consonant-vowel case) RV is the region after the third letter.
     * But RV is the end of the word if these positions cannot be found.
     *
     * @return boolean true when RV was located or the word is shorter than 3 letters,
     *                 false when no suitable position exists
     */
    protected function rv()
    {
        $length = UTF8::strlen($this->word);

        // defaults: empty RV at the end of the word
        $this->rv = '';
        $this->rvIndex = $length;

        if ($length < 3) {
            return true;
        }

        $firstIsVowel = in_array(UTF8::substr($this->word, 0, 1), static::$vowels, true);
        $secondIsVowel = in_array(UTF8::substr($this->word, 1, 1), static::$vowels, true);

        // consonant-vowel case: RV is the region after the third letter
        if (!$firstIsVowel && $secondIsVowel) {
            $this->rv = UTF8::substr($this->word, 3);
            $this->rvIndex = 3;

            return true;
        }

        // second letter is a consonant: stop at the next vowel;
        // first two letters are vowels: stop at the next consonant
        $stopOnVowel = !$secondIsVowel;
        for ($i = 2; $i < $length; $i++) {
            $isVowel = in_array(UTF8::substr($this->word, $i, 1), static::$vowels, true);

            if ($isVowel === $stopOnVowel) {
                $this->rvIndex = $i + 1;
                $this->rv = UTF8::substr($this->word, ($i + 1));

                return true;
            }
        }

        return false;
    }
}
|
||||
19
libraries/vendor/wamania/php-stemmer/src/Stemmer/Stemmer.php
vendored
Normal file
19
libraries/vendor/wamania/php-stemmer/src/Stemmer/Stemmer.php
vendored
Normal file
@ -0,0 +1,19 @@
|
||||
<?php
|
||||
namespace Wamania\Snowball\Stemmer;
|
||||
|
||||
/**
|
||||
* @author Luís Cobucci <lcobucci@gmail.com>
|
||||
*/
|
||||
interface Stemmer
{
    /**
     * Returns the stem of the given word.
     *
     * @param string $word the word to stem; must be valid UTF-8
     *
     * @return string the stemmed word
     *
     * @throws \Exception
     */
    public function stem($word);
}
|
||||
130
libraries/vendor/wamania/php-stemmer/src/Stemmer/Swedish.php
vendored
Normal file
130
libraries/vendor/wamania/php-stemmer/src/Stemmer/Swedish.php
vendored
Normal file
@ -0,0 +1,130 @@
|
||||
<?php
|
||||
|
||||
namespace Wamania\Snowball\Stemmer;
|
||||
|
||||
use voku\helper\UTF8;
|
||||
|
||||
/**
|
||||
*
|
||||
* @link http://snowball.tartarus.org/algorithms/swedish/stemmer.html
|
||||
* @author wamania
|
||||
*
|
||||
*/
|
||||
class Swedish extends Stem
{
    /**
     * All swedish vowels
     */
    protected static $vowels = array('a', 'e', 'i', 'o', 'u', 'y', 'ä', 'å', 'ö');

    /**
     * {@inheritdoc}
     *
     * @throws \Exception when $word is not valid UTF-8
     */
    public function stem($word)
    {
        // Every string operation below is UTF-8 aware, so anything else is rejected up front.
        $isValidUtf8 = UTF8::is_utf8($word);
        if (!$isValidUtf8) {
            throw new \Exception('Word must be in UTF-8');
        }

        $this->word = UTF8::strtolower($word);

        // Only R1 is needed by the three steps below.
        $this->r1();

        // R1 is adjusted so that the region before it contains at least 3 letters.
        if ($this->r1Index < 3) {
            $this->r1Index = 3;
            $this->r1 = UTF8::substr($this->word, 3);
        }

        // Apply steps 1, 2 and 3 in order.
        $this->step1();
        $this->step2();
        $this->step3();

        return $this->word;
    }

    /**
     * Define a valid s-ending as one of
     * b c d f g h j k l m n o p r t v y
     *
     * @param string $word the word whose last letter is examined
     * @return boolean true when the last letter of $word is a valid s-ending
     */
    private function hasValidSEnding($word)
    {
        $validEndings = array('b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y');

        return in_array(UTF8::substr($word, -1, 1), $validEndings);
    }

    /**
     * Step 1
     * Search for the longest among the following suffixes in R1, and perform the action indicated.
     */
    private function step1()
    {
        // a   arna   erna   heterna   orna   ad   e   ade   ande   arne   are   aste   en   anden   aren   heten
        // ern   ar   er   heter   or   as   arnas   ernas   ornas   es   ades   andes   ens   arens   hetens
        // erns   at   andet   het   ast
        //      delete
        $endings = array(
            'heterna', 'hetens', 'ornas', 'andes', 'arnas', 'heter', 'ernas', 'anden', 'heten', 'andet', 'arens',
            'orna', 'arna', 'erna', 'aren', 'ande', 'ades', 'arne', 'erns', 'aste', 'ade', 'ern', 'het',
            'ast', 'are', 'ens', 'or', 'es', 'ad', 'en', 'at', 'ar', 'as', 'er', 'a', 'e'
        );

        $position = $this->searchIfInR1($endings);
        if ($position !== false) {
            $this->word = UTF8::substr($this->word, 0, $position);

            return true;
        }

        // s
        //      delete if preceded by a valid s-ending
        $position = $this->searchIfInR1(array('s'));
        if ($position === false) {
            return;
        }

        $withoutS = UTF8::substr($this->word, 0, $position);
        if ($this->hasValidSEnding($withoutS)) {
            $this->word = $withoutS;
        }
    }

    /**
     * Step 2
     * Search for one of the following suffixes in R1, and if found delete the last letter.
     */
    private function step2()
    {
        // dd   gd   nn   dt   gt   kt   tt
        $found = $this->searchIfInR1(array('dd', 'gd', 'nn', 'dt', 'gt', 'kt', 'tt'));
        if ($found === false) {
            return;
        }

        $this->word = UTF8::substr($this->word, 0, -1);
    }

    /**
     * Step 3:
     * Search for the longest among the following suffixes in R1, and perform the action indicated.
     */
    private function step3()
    {
        // lig   ig   els
        //      delete
        $position = $this->searchIfInR1(array('lig', 'ig', 'els'));
        if ($position !== false) {
            $this->word = UTF8::substr($this->word, 0, $position);

            return true;
        }

        // löst  -> replace with lös
        // fullt -> replace with full
        // Both replacements amount to dropping the final letter.
        foreach (array('löst', 'fullt') as $ending) {
            if ($this->searchIfInR1(array($ending)) !== false) {
                $this->word = UTF8::substr($this->word, 0, -1);

                return true;
            }
        }
    }
}
|
||||
56
libraries/vendor/wamania/php-stemmer/src/StemmerFactory.php
vendored
Normal file
56
libraries/vendor/wamania/php-stemmer/src/StemmerFactory.php
vendored
Normal file
@ -0,0 +1,56 @@
|
||||
<?php
|
||||
|
||||
namespace Wamania\Snowball;
|
||||
|
||||
use voku\helper\UTF8;
|
||||
use Wamania\Snowball\Stemmer\Catalan;
|
||||
use Wamania\Snowball\Stemmer\Danish;
|
||||
use Wamania\Snowball\Stemmer\Dutch;
|
||||
use Wamania\Snowball\Stemmer\English;
|
||||
use Wamania\Snowball\Stemmer\Finnish;
|
||||
use Wamania\Snowball\Stemmer\French;
|
||||
use Wamania\Snowball\Stemmer\German;
|
||||
use Wamania\Snowball\Stemmer\Italian;
|
||||
use Wamania\Snowball\Stemmer\Norwegian;
|
||||
use Wamania\Snowball\Stemmer\Portuguese;
|
||||
use Wamania\Snowball\Stemmer\Romanian;
|
||||
use Wamania\Snowball\Stemmer\Russian;
|
||||
use Wamania\Snowball\Stemmer\Spanish;
|
||||
use Wamania\Snowball\Stemmer\Stemmer;
|
||||
use Wamania\Snowball\Stemmer\Swedish;
|
||||
|
||||
class StemmerFactory
{
    /**
     * Maps each stemmer class to the lower-case language codes and names it answers to.
     */
    const LANGS = [
        Catalan::class    => ['ca', 'cat', 'catalan'],
        Danish::class     => ['da', 'dan', 'danish'],
        Dutch::class      => ['nl', 'dut', 'nld', 'dutch'],
        English::class    => ['en', 'eng', 'english'],
        Finnish::class    => ['fi', 'fin', 'finnish'],
        French::class     => ['fr', 'fre', 'fra', 'french'],
        German::class     => ['de', 'deu', 'ger', 'german'],
        Italian::class    => ['it', 'ita', 'italian'],
        Norwegian::class  => ['no', 'nor', 'norwegian'],
        Portuguese::class => ['pt', 'por', 'portuguese'],
        Romanian::class   => ['ro', 'rum', 'ron', 'romanian'],
        Russian::class    => ['ru', 'rus', 'russian'],
        Spanish::class    => ['es', 'spa', 'spanish'],
        Swedish::class    => ['sv', 'swe', 'swedish']
    ];

    /**
     * Builds the stemmer registered for a language code or name (matched case-insensitively).
     *
     * @throws NotFoundException when no stemmer is registered for $code
     */
    public static function create(string $code): Stemmer
    {
        $code = UTF8::strtolower($code);

        // Keep only the entries whose code list contains the requested code.
        $candidates = array_filter(self::LANGS, function ($isoCodes) use ($code) {
            return in_array($code, $isoCodes);
        });

        if (count($candidates) === 0) {
            throw new NotFoundException(sprintf('Stemmer not found for %s', $code));
        }

        // array_filter preserves keys and order, so the first key is the first matching class.
        $classname = key($candidates);

        return new $classname;
    }
}
|
||||
26
libraries/vendor/wamania/php-stemmer/src/StemmerManager.php
vendored
Normal file
26
libraries/vendor/wamania/php-stemmer/src/StemmerManager.php
vendored
Normal file
@ -0,0 +1,26 @@
|
||||
<?php
|
||||
|
||||
namespace Wamania\Snowball;
|
||||
|
||||
class StemmerManager
{
    /** @var array stemmer instances already created, keyed by the code they were requested with */
    private $stemmers;

    public function __construct()
    {
        $this->stemmers = array();
    }

    /**
     * Stems a word with the stemmer for the given code, creating and caching that stemmer on first use.
     *
     * @throws NotFoundException
     */
    public function stem(string $word, string $isoCode): string
    {
        // Reuse the cached instance when there is one, otherwise ask the factory for it.
        $this->stemmers[$isoCode] = $this->stemmers[$isoCode] ?? StemmerFactory::create($isoCode);

        $stemmer = $this->stemmers[$isoCode];

        return $stemmer->stem($word);
    }
}
|
||||
Reference in New Issue
Block a user