@@ -73,29 +73,63 @@ public function getLexicalDiversity(string $lexicalDiversityClassName = Naive::c
/**
 * Find every occurrence of a literal needle in the corpus text and return
 * each hit together with its surrounding context.
 *
 * The text is whitespace-normalised (runs of whitespace collapse to a single
 * space) and padded with one space on each side before searching, so an
 * excerpt that touches either end of the text carries that padding space.
 * All offsets and lengths are counted in characters, not bytes.
 *
 * See https://stackoverflow.com/questions/15737408/php-find-all-occurrences-of-a-substring-in-a-string
 *
 * @param string $needle The literal text to look for (regex metacharacters are escaped)
 * @param int $contextLength The number of characters kept left and right of the found needle
 * @param bool $ignorecase Whether the match is case-insensitive
 * @param string $position Where the needle must sit inside a word.
 *                         Available options: contain, begin, end, equal.
 *                         Any other value behaves like contain.
 * @return array List of excerpt strings, in order of appearance
 */
public function concordance(string $needle, int $contextLength = 20, bool $ignorecase = true, string $position = 'contain') : array
{
    // An empty needle would "match" between every pair of characters.
    if ($needle === '') {
        return [];
    }
    $contextLength = max($contextLength, 0);

    // Work on a local copy: the corpus text itself must never be rewritten
    // by a read-only query.
    $text = (string) $this->text;

    // The /u modifier rejects malformed UTF-8, so replace invalid byte
    // sequences up front instead of letting the regex calls fail.
    if (!mb_check_encoding($text, 'UTF-8')) {
        $text = mb_convert_encoding($text, 'UTF-8', 'UTF-8');
    }
    if (!mb_check_encoding($needle, 'UTF-8')) {
        $needle = mb_convert_encoding($needle, 'UTF-8', 'UTF-8');
    }

    $collapsed = preg_replace('/\s+/u', ' ', $text);
    $text = ' ' . trim($collapsed ?? $text) . ' ';

    // The needle is literal text, not a regular expression.
    $quotedNeedle = preg_quote($needle, '/');

    // A "word" is made of letters from any language (\p{L}) plus a few
    // joining characters: slash, hyphen, underscore and apostrophe.
    $wordChar = "[\\p{L}\\/\\-_']";

    // Lookarounds test the neighbouring character without consuming it, so
    // two hits separated by a single delimiter are both reported.
    switch ($position) {
        case 'equal':
            $pattern = "(?<!$wordChar)($quotedNeedle)(?!$wordChar)";
            break;
        case 'begin':
            $pattern = "(?<!$wordChar)($quotedNeedle)";
            break;
        case 'end':
            $pattern = "($quotedNeedle)(?!$wordChar)";
            break;
        case 'contain':
        default:
            $pattern = "($quotedNeedle)";
            break;
    }

    $modifiers = 'u' . ($ignorecase ? 'i' : '');
    if (!preg_match_all('/' . $pattern . '/' . $modifiers, $text, $matches, PREG_OFFSET_CAPTURE)) {
        return [];
    }

    $needleLength = mb_strlen($needle, 'UTF-8');
    $bufferLength = $needleLength + 2 * $contextLength;

    $found = [];
    $byteCursor = 0;
    $charCursor = 0;

    foreach ($matches[1] as $match) {
        // PREG_OFFSET_CAPTURE reports byte offsets; convert to a character
        // offset incrementally (matches arrive in ascending order).
        $byteOffset = $match[1];
        $charCursor += mb_strlen(substr($text, $byteCursor, $byteOffset - $byteCursor), 'UTF-8');
        $byteCursor = $byteOffset;

        // The window starts $contextLength characters before the hit, clamped
        // to the start of the text; its length stays $bufferLength, and
        // mb_substr() stops at the end of the text on its own.
        $left = max($charCursor - $contextLength, 0);
        $found[] = mb_substr($text, $left, $bufferLength, 'UTF-8');
    }

    return $found;
}
101135
0 commit comments