44
55use TextAnalysis \Tokenizers \GeneralTokenizer ;
66use TextAnalysis \LexicalDiversity \Naive ;
7+ use TextAnalysis \Utilities \Text ;
78
89/**
910 * Explore the text corpus
1011 * @author yooper
1112 */
12- class TextCorpus
13+ class TextCorpus
1314{
1415 /**
1516 *
1617 * @var string
1718 */
1819 protected $ text ;
19-
20+
2021 /**
2122 *
2223 * @var array
2324 */
2425 protected $ tokens = [];
25-
/**
 * Create a corpus around the given raw text.
 *
 * @param string $text The text to explore; stored as-is, no tokenization happens here
 */
public function __construct(string $text)
{
    $this->text = $text;
}
30-
31+
3132 /**
3233 * Returns the original text
3334 * @return string
@@ -36,15 +37,15 @@ public function getText() : string
3637 {
3738 return $ this ->text ;
3839 }
39-
40+
/**
 * Return the tokens of the corpus text, tokenizing on first use.
 *
 * The result is memoized in $this->tokens. The memo is reused whenever it is
 * non-empty, regardless of which tokenizer class is passed on later calls.
 * NOTE(review): a second call with a different $tokenizerClassName therefore
 * returns the tokens produced by the first tokenizer - confirm this is intended.
 *
 * @param string $tokenizerClassName Tokenizer class name handed to tokenize()
 * @return array
 */
public function getTokens(string $tokenizerClassName = GeneralTokenizer::class) : array
{
    // Fast path: something was already tokenized and cached.
    if (!empty($this->tokens)) {
        return $this->tokens;
    }

    $this->tokens = tokenize($this->getText(), $tokenizerClassName);

    return $this->tokens;
}
47-
48+
4849 /**
4950 * Return a list of positions where the needles were found in the text
5051 * @param array $needles
@@ -59,7 +60,7 @@ public function getDispersion(array $needles) : array
5960 }
6061 return $ found ;
6162 }
62-
63+
6364 /**
6465 * Compute the lexical diversity, the default uses a naive algorithm
6566 * @param string $lexicalDiversityClassName
@@ -69,23 +70,51 @@ public function getLexicalDiversity(string $lexicalDiversityClassName = Naive::c
6970 {
7071 return lexical_diversity ($ this ->getTokens (), $ lexicalDiversityClassName );
7172 }
72-
73+
/**
 * Collect an excerpt of the text around every occurrence of the needle.
 *
 * The text is whitespace-normalized (runs of whitespace collapsed to a single
 * space, then trimmed) before searching. Match offsets come from
 * concordancePositions(); each excerpt is produced by Text::getExcerpt().
 *
 * See https://stackoverflow.com/questions/15737408/php-find-all-occurrences-of-a-substring-in-a-string
 *
 * @param string $needle        The term to look for; an empty needle yields an empty result
 * @param int    $contextLength The amount of space left and right of the found needle
 * @param bool   $ignorecase    Whether matching ignores case
 * @param string $position      Available options: contain, begin, end, equal.
 * @param bool   $mark          Option to mark the needle; when true the markers
 *                              '{{' and '}}' are passed to Text::markString()
 * @return array List of excerpts, one per match, in match order
 */
public function concordance(string $needle, int $contextLength = 20, bool $ignorecase = true, string $position = 'contain', bool $mark = false) : array
{
    // Temporary solution to handle unicode chars: work on a single-byte copy so
    // the byte offsets used below line up with characters.
    // NOTE(review): utf8_decode()/utf8_encode() are deprecated as of PHP 8.2 and
    // lossy for characters outside ISO-8859-1 - plan a migration.
    $text = utf8_decode($this->text);
    $text = trim(preg_replace('/\s+/', ' ', $text));
    $needle = utf8_decode($needle);

    // An empty needle turns the 'contain' pattern into /()/, which matches at
    // every offset of the text; there is nothing meaningful to report.
    if ($needle === '') {
        return [];
    }

    $needleLength = strlen($needle);

    // Keep the excerpt width in sync with the markers instead of a magic "+4".
    $markers = ['{{', '}}'];
    $excerptNeedleLength = $mark
        ? $needleLength + strlen($markers[0]) + strlen($markers[1])
        : $needleLength;

    $positions = $this->concordancePositions($text, $needle, $contextLength, $ignorecase, $position);

    // Getting excerpts
    $found = [];
    foreach ($positions as $needlePosition) {
        // Marking is redone from the unmarked text for every match, so offsets
        // of later matches are never shifted by earlier markers.
        $haystack = $mark
            ? Text::markString($text, $needlePosition, $needleLength, $markers)
            : $text;

        $found[] = utf8_encode(Text::getExcerpt($haystack, $needlePosition, $excerptNeedleLength, $contextLength));
    }

    return $found;
}
105+
106+ /**
107+ * Return all positions of the needle in the text according to the position of the needle in a word.
108+ * @param string $text
109+ * @param string $needle
110+ * @param int $contextLength The amount of space left and right of the found needle
111+ * @param bool $ignorecase
112+ * @param string $position Available options: contain, begin, end, equal.
113+ * @return array
114+ */
115+ public function concordancePositions (string $ text , string $ needle , int $ contextLength = 20 , bool $ ignorecase = true , string $ position = 'contain ' ) : array
116+ {
87117 $ found = [];
88- $ text = ' ' . trim (preg_replace ('/[\s\t\n\r\s]+/ ' , ' ' , $ this ->text )) . ' ' ;
89118 $ needleLength = strlen ($ needle );
90119 $ textLength = strlen ($ text );
91120 $ bufferLength = $ needleLength + 2 * $ contextLength ;
@@ -97,13 +126,13 @@ public function concordance(string $needle, int $contextLength = 20, bool $ignor
97126
98127 switch ($ position ) {
99128 case 'equal ' :
100- $ pattern = "/[^ $ word_part]( $ needle)[^ $ word_part]/ " ;
129+ $ pattern = "/(?<![ $ word_part]) ( $ needle)(?![ $ word_part]) / " ;
101130 break ;
102131 case 'begin ' :
103- $ pattern = "/[^ $ word_part]( $ needle)[ $ special_chars]?[\p{L}]*|^( $ needle)/ " ;
132+ $ pattern = "/(?<![ $ word_part]) ( $ needle)[ $ special_chars]?[\p{L}]*|^( $ needle)/ " ;
104133 break ;
105134 case 'end ' :
106- $ pattern = "/[\p{L}]*[ $ special_chars]?[\p{L}]*( $ needle)[^ $ word_part]/ " ;
135+ $ pattern = "/[\p{L}]*[ $ special_chars]?[\p{L}]*( $ needle)(?![ $ word_part]) / " ;
107136 break ;
108137 case 'contain ' :
109138 $ pattern = "/( $ needle)/ " ;
@@ -115,24 +144,11 @@ public function concordance(string $needle, int $contextLength = 20, bool $ignor
115144
116145 $ case = $ ignorecase ? 'i ' : '' ;
117146 preg_match_all ($ pattern .$ case , $ text , $ matches , PREG_OFFSET_CAPTURE );
147+ $ positions = array_column ($ matches [1 ], 1 );
118148
119- // Getting excerpts
120- foreach ($ matches [1 ] as $ match ) {
121-
122- $ needlePosition = $ match [1 ];
123- $ left = max ($ needlePosition - $ contextLength , 0 );
124-
125- if ($ needleLength + $ contextLength + $ needlePosition > $ textLength ) {
126- $ tmp = substr ($ text , $ left );
127- } else {
128- $ tmp = substr ($ text , $ left , $ bufferLength );
129- }
130- $ found [] = utf8_encode ($ tmp );
131- }
132-
133- return $ found ;
149+ return $ positions ;
134150 }
135-
151+
136152 /**
137153 * Get percentage of times the needle shows up in the text
138154 * @param string $needle
@@ -143,7 +159,7 @@ public function percentage(string $needle) : float
143159 $ freqDist = freq_dist ($ this ->getTokens ());
144160 return $ freqDist ->getKeyValuesByFrequency ()[$ needle ] / $ freqDist ->getTotalTokens ();
145161 }
146-
162+
147163 /**
148164 * Performs a case insensitive search for the needle
149165 * @param string $needle
@@ -153,7 +169,7 @@ public function count(string $needle) : int
153169 {
154170 return substr_count (strtolower ($ this ->getText ()), strtolower ($ needle ));
155171 }
156-
172+
157173 /**
158174 * Return all the position of the needle found in the text
159175 * @param string $needle
@@ -166,7 +182,7 @@ public function findAll(string $needle) : array
166182 $ needle = strtolower ($ needle );
167183 $ text = strtolower ($ this ->getText ());
168184 $ needleLength = strlen ($ needle );
169- while (($ lastPos = stripos ($ text , $ needle , $ lastPos ))!== false )
185+ while (($ lastPos = stripos ($ text , $ needle , $ lastPos ))!== false )
170186 {
171187 $ positions [] = $ lastPos ;
172188 $ lastPos += $ needleLength ;
@@ -177,8 +193,8 @@ public function toString()
177193 {
178194 return $ this ->text ;
179195 }
180-
181- public function __destruct ()
196+
197+ public function __destruct ()
182198 {
183199 unset($ this ->text );
184200 unset($ this ->tokens );
0 commit comments