|
5 | 5 | */
6 | 6 | package org.elasticsearch.xpack.core.ml.job.config; |
7 | 7 |
|
8 | | -import org.apache.lucene.analysis.Analyzer; |
9 | | -import org.elasticsearch.Version; |
10 | | -import org.elasticsearch.action.admin.indices.analyze.TransportAnalyzeAction; |
11 | | -import org.elasticsearch.cluster.metadata.IndexMetaData; |
12 | 8 | import org.elasticsearch.common.ParseField; |
13 | 9 | import org.elasticsearch.common.Strings; |
14 | | -import org.elasticsearch.common.UUIDs; |
15 | | -import org.elasticsearch.common.collect.Tuple; |
16 | 10 | import org.elasticsearch.common.io.stream.StreamInput; |
17 | 11 | import org.elasticsearch.common.io.stream.StreamOutput; |
18 | 12 | import org.elasticsearch.common.io.stream.Writeable; |
|
22 | 16 | import org.elasticsearch.common.xcontent.XContentFactory; |
23 | 17 | import org.elasticsearch.common.xcontent.XContentParser; |
24 | 18 | import org.elasticsearch.common.xcontent.XContentType; |
25 | | -import org.elasticsearch.env.Environment; |
26 | | -import org.elasticsearch.index.IndexSettings; |
27 | | -import org.elasticsearch.index.analysis.AnalysisRegistry; |
28 | | -import org.elasticsearch.index.analysis.CharFilterFactory; |
29 | | -import org.elasticsearch.index.analysis.CustomAnalyzer; |
30 | | -import org.elasticsearch.index.analysis.CustomAnalyzerProvider; |
31 | | -import org.elasticsearch.index.analysis.TokenFilterFactory; |
32 | | -import org.elasticsearch.index.analysis.TokenizerFactory; |
33 | | -import org.elasticsearch.indices.analysis.AnalysisModule; |
34 | 19 | import org.elasticsearch.rest.action.admin.indices.RestAnalyzeAction; |
35 | 20 | import org.elasticsearch.xpack.core.ml.MlParserType; |
36 | 21 |
|
|
42 | 27 | import java.util.Map; |
43 | 28 | import java.util.Objects; |
44 | 29 |
|
45 | | - |
46 | 30 | /** |
47 | 31 | * Configuration for the categorization analyzer. |
48 | 32 | * |
49 | 33 | * The syntax is a subset of what can be supplied to the {@linkplain RestAnalyzeAction <code>_analyze</code> endpoint}. |
50 | | - * To summarise, the first option is to specify the name of an out-of-the-box analyzer: |
| 34 | + * To summarize, the first option is to specify the name of an out-of-the-box analyzer: |
51 | 35 | * <code> |
52 | 36 | * "categorization_analyzer" : "standard" |
53 | 37 | * </code> |
|
66 | 50 | * { "type" : "pattern_replace", "pattern": "^[0-9].*" } |
67 | 51 | * ] |
68 | 52 | * </code> |
69 | | - * |
70 | | - * Unfortunately there is no easy way to reuse a subset of the <code>_analyze</code> action implementation, so much
71 | | - * of the code in this file is copied from {@link TransportAnalyzeAction}. The logic required here is
72 | | - * not quite identical to that of {@link TransportAnalyzeAction}, and the required code is hard to partially reuse.
73 | | - * TODO: consider refactoring ES core to allow more reuse. |
74 | 53 | */ |
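As an illustration of the custom syntax summarized above, a config combining all three parts could be built programmatically through the nested Builder defined later in this file. This is a minimal sketch, assuming the Builder exposes fluent setters named after the config fields (setAnalyzer, addCharFilter, setTokenizer, addTokenFilter); check the Builder class itself for the exact signatures:

    // Hypothetical sketch: the programmatic equivalent of
    //   "categorization_analyzer" : {
    //       "char_filter" : [ "html_strip" ],
    //       "tokenizer" : "classic",
    //       "filter" : [ "lowercase" ]
    //   }
    CategorizationAnalyzerConfig config = new CategorizationAnalyzerConfig.Builder()
            .addCharFilter("html_strip")
            .setTokenizer("classic")
            .addTokenFilter("lowercase")
            .build();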
75 | 54 | public class CategorizationAnalyzerConfig implements ToXContentFragment, Writeable { |
76 | 55 |
|
@@ -350,175 +329,6 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws |
350 | 329 | return builder; |
351 | 330 | } |
352 | 331 |
|
353 | | - /** |
354 | | - * Convert the config to an {@link Analyzer}. This may be a global analyzer or a newly created custom analyzer. |
355 | | - * In the case of a global analyzer the caller must NOT close it when they have finished with it. In the case of |
356 | | - * a newly created custom analyzer the caller is responsible for closing it. |
357 | | - * @return The first tuple member is the {@link Analyzer}; the second indicates whether the caller is responsible |
358 | | - * for closing it. |
359 | | - */ |
360 | | - public Tuple<Analyzer, Boolean> toAnalyzer(AnalysisRegistry analysisRegistry, Environment environment) throws IOException { |
361 | | - if (analyzer != null) { |
362 | | - Analyzer globalAnalyzer = analysisRegistry.getAnalyzer(analyzer); |
363 | | - if (globalAnalyzer == null) { |
364 | | - throw new IllegalArgumentException("Failed to find global analyzer [" + analyzer + "]"); |
365 | | - } |
366 | | - return new Tuple<>(globalAnalyzer, Boolean.FALSE); |
367 | | - } else { |
368 | | - List<CharFilterFactory> charFilterFactoryList = |
369 | | - parseCharFilterFactories(analysisRegistry, environment); |
370 | | - |
371 | | - Tuple<String, TokenizerFactory> tokenizerFactory = parseTokenizerFactory(analysisRegistry, |
372 | | - environment); |
373 | | - |
374 | | - List<TokenFilterFactory> tokenFilterFactoryList = parseTokenFilterFactories(analysisRegistry, |
375 | | - environment, tokenizerFactory, charFilterFactoryList); |
376 | | - |
377 | | - return new Tuple<>(new CustomAnalyzer(tokenizerFactory.v1(), tokenizerFactory.v2(), |
378 | | - charFilterFactoryList.toArray(new CharFilterFactory[charFilterFactoryList.size()]), |
379 | | - tokenFilterFactoryList.toArray(new TokenFilterFactory[tokenFilterFactoryList.size()])), Boolean.TRUE); |
380 | | - } |
381 | | - } |
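The ownership contract in the Javadoc above deserves a caller-side illustration. This is a minimal sketch, assuming a config, an AnalysisRegistry, and an Environment are in scope; it mirrors the close-if-owned pattern used by Builder.verify() further down:

    Tuple<Analyzer, Boolean> tuple = config.toAnalyzer(analysisRegistry, environment);
    try {
        Analyzer analyzer = tuple.v1();
        // ... tokenize categorization fields with the analyzer ...
    } finally {
        // Only close an analyzer created for this call; a global analyzer
        // is shared and must be left open.
        if (tuple.v2()) {
            tuple.v1().close();
        }
    }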
382 | | - |
383 | | - |
384 | | - /** |
385 | | - * Get char filter factories for each configured char filter. Each configuration |
386 | | - * element can be the name of an out-of-the-box char filter, or a custom definition. |
387 | | - */ |
388 | | - private List<CharFilterFactory> parseCharFilterFactories(AnalysisRegistry analysisRegistry, |
389 | | - Environment environment) throws IOException { |
390 | | - final List<CharFilterFactory> charFilterFactoryList = new ArrayList<>(); |
391 | | - for (NameOrDefinition charFilter : charFilters) { |
392 | | - final CharFilterFactory charFilterFactory; |
393 | | - if (charFilter.name != null) { |
394 | | - AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory = |
395 | | - analysisRegistry.getCharFilterProvider(charFilter.name); |
396 | | - if (charFilterFactoryFactory == null) { |
397 | | - throw new IllegalArgumentException("Failed to find global char filter under [" + charFilter.name + "]"); |
398 | | - } |
399 | | - charFilterFactory = charFilterFactoryFactory.get(environment, charFilter.name); |
400 | | - } else { |
401 | | - String charFilterTypeName = charFilter.definition.get("type"); |
402 | | - if (charFilterTypeName == null) { |
403 | | - throw new IllegalArgumentException("Missing [type] setting for char filter: " + charFilter.definition); |
404 | | - } |
405 | | - AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory = |
406 | | - analysisRegistry.getCharFilterProvider(charFilterTypeName); |
407 | | - if (charFilterFactoryFactory == null) { |
408 | | - throw new IllegalArgumentException("Failed to find global char filter under [" + charFilterTypeName + "]"); |
409 | | - } |
410 | | - Settings settings = augmentSettings(charFilter.definition); |
411 | | - // Need to set anonymous "name" of char_filter |
412 | | - charFilterFactory = charFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment, |
413 | | - "_anonymous_charfilter", settings); |
414 | | - } |
415 | | - if (charFilterFactory == null) { |
416 | | - throw new IllegalArgumentException("Failed to find char filter [" + charFilter + "]"); |
417 | | - } |
418 | | - charFilterFactoryList.add(charFilterFactory); |
419 | | - } |
420 | | - return charFilterFactoryList; |
421 | | - } |
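For reference, the two shapes this method accepts mirror the <code>_analyze</code> endpoint: a bare name resolves a pre-registered char filter, while an object definition must carry a "type" key. A minimal sketch of producing both shapes with XContentBuilder (already imported by this file), assuming "char_filter" is the field name the surrounding parser expects:

    XContentBuilder builder = XContentFactory.jsonBuilder();
    builder.startObject();
    builder.startArray("char_filter");
    builder.value("html_strip");               // shape 1: out-of-the-box name
    builder.startObject();                     // shape 2: custom definition; omitting
    builder.field("type", "pattern_replace");  // "type" fails with "Missing [type]
    builder.field("pattern", "^[0-9].*");      // setting for char filter"
    builder.endObject();
    builder.endArray();
    builder.endObject();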
422 | | - |
423 | | - /** |
424 | | - * Get the tokenizer factory for the configured tokenizer. The configuration |
425 | | - * can be the name of an out-of-the-box tokenizer, or a custom definition. |
426 | | - */ |
427 | | - private Tuple<String, TokenizerFactory> parseTokenizerFactory(AnalysisRegistry analysisRegistry, |
428 | | - Environment environment) throws IOException { |
429 | | - final String name; |
430 | | - final TokenizerFactory tokenizerFactory; |
431 | | - if (tokenizer.name != null) { |
432 | | - name = tokenizer.name; |
433 | | - AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory = analysisRegistry.getTokenizerProvider(name); |
434 | | - if (tokenizerFactoryFactory == null) { |
435 | | - throw new IllegalArgumentException("Failed to find global tokenizer under [" + name + "]"); |
436 | | - } |
437 | | - tokenizerFactory = tokenizerFactoryFactory.get(environment, name); |
438 | | - } else { |
439 | | - String tokenizerTypeName = tokenizer.definition.get("type"); |
440 | | - if (tokenizerTypeName == null) { |
441 | | - throw new IllegalArgumentException("Missing [type] setting for tokenizer: " + tokenizer.definition); |
442 | | - } |
443 | | - AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory = |
444 | | - analysisRegistry.getTokenizerProvider(tokenizerTypeName); |
445 | | - if (tokenizerFactoryFactory == null) { |
446 | | - throw new IllegalArgumentException("Failed to find global tokenizer under [" + tokenizerTypeName + "]"); |
447 | | - } |
448 | | - Settings settings = augmentSettings(tokenizer.definition); |
449 | | - // Need to set anonymous "name" of tokenizer |
450 | | - name = "_anonymous_tokenizer"; |
451 | | - tokenizerFactory = tokenizerFactoryFactory.get(buildDummyIndexSettings(settings), environment, name, settings); |
452 | | - } |
453 | | - return new Tuple<>(name, tokenizerFactory); |
454 | | - } |
455 | | - |
456 | | - /** |
457 | | - * Get token filter factories for each configured token filter. Each configuration |
458 | | - * element can be the name of an out-of-the-box token filter, or a custom definition. |
459 | | - */ |
460 | | - private List<TokenFilterFactory> parseTokenFilterFactories(AnalysisRegistry analysisRegistry, Environment environment, |
461 | | - Tuple<String, TokenizerFactory> tokenizerFactory, |
462 | | - List<CharFilterFactory> charFilterFactoryList) throws IOException { |
463 | | - final List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>(); |
464 | | - for (NameOrDefinition tokenFilter : tokenFilters) { |
465 | | - TokenFilterFactory tokenFilterFactory; |
466 | | - if (tokenFilter.name != null) { |
467 | | - AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory; |
468 | | - tokenFilterFactoryFactory = analysisRegistry.getTokenFilterProvider(tokenFilter.name); |
469 | | - if (tokenFilterFactoryFactory == null) { |
470 | | - throw new IllegalArgumentException("Failed to find global token filter under [" + tokenFilter.name + "]"); |
471 | | - } |
472 | | - tokenFilterFactory = tokenFilterFactoryFactory.get(environment, tokenFilter.name); |
473 | | - } else { |
474 | | - String filterTypeName = tokenFilter.definition.get("type"); |
475 | | - if (filterTypeName == null) { |
476 | | - throw new IllegalArgumentException("Missing [type] setting for token filter: " + tokenFilter.definition); |
477 | | - } |
478 | | - AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory = |
479 | | - analysisRegistry.getTokenFilterProvider(filterTypeName); |
480 | | - if (tokenFilterFactoryFactory == null) { |
481 | | - throw new IllegalArgumentException("Failed to find global token filter under [" + filterTypeName + "]"); |
482 | | - } |
483 | | - Settings settings = augmentSettings(tokenFilter.definition); |
484 | | - // Need to set anonymous "name" of token_filter |
485 | | - tokenFilterFactory = tokenFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment, |
486 | | - "_anonymous_tokenfilter", settings); |
487 | | - tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(), |
488 | | - tokenizerFactory.v2(), tokenFilterFactoryList, charFilterFactoryList, environment); |
489 | | - } |
490 | | - if (tokenFilterFactory == null) { |
491 | | - throw new IllegalArgumentException("Failed to find or create token filter [" + tokenFilter + "]"); |
492 | | - } |
493 | | - tokenFilterFactoryList.add(tokenFilterFactory); |
494 | | - } |
495 | | - return tokenFilterFactoryList; |
496 | | - } |
497 | | - |
498 | | - /** |
499 | | - * The Elasticsearch analysis functionality is designed to work with indices. For |
500 | | - * categorization we have to pretend we've got some index settings. |
501 | | - */ |
502 | | - private IndexSettings buildDummyIndexSettings(Settings settings) { |
503 | | - IndexMetaData metaData = IndexMetaData.builder(IndexMetaData.INDEX_UUID_NA_VALUE).settings(settings).build(); |
504 | | - return new IndexSettings(metaData, Settings.EMPTY); |
505 | | - } |
506 | | - |
507 | | - /** |
508 | | - * The behaviour of Elasticsearch analyzers can vary between versions. |
509 | | - * For categorization we'll always use the latest version of the text analysis. |
510 | | - * The other settings are just to stop classes that expect to be associated with |
511 | | - * an index from complaining. |
512 | | - */ |
513 | | - private Settings augmentSettings(Settings settings) { |
514 | | - return Settings.builder().put(settings) |
515 | | - .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) |
516 | | - .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0) |
517 | | - .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1) |
518 | | - .put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID()) |
519 | | - .build(); |
520 | | - } |
521 | | - |
522 | 332 | @Override |
523 | 333 | public boolean equals(Object o) { |
524 | 334 | if (this == o) return true; |
@@ -609,17 +419,5 @@ public CategorizationAnalyzerConfig build() { |
609 | 419 | } |
610 | 420 | return new CategorizationAnalyzerConfig(analyzer, charFilters, tokenizer, tokenFilters); |
611 | 421 | } |
612 | | - |
613 | | - /** |
614 | | - * Verify that the builder will build a valid config. This is not done as part of the basic build |
615 | | - * because it verifies that the names of analyzers/tokenizers/filters referenced by the config are |
616 | | - * known, and the validity of these names could change over time. |
617 | | - */ |
618 | | - public void verify(AnalysisRegistry analysisRegistry, Environment environment) throws IOException { |
619 | | - Tuple<Analyzer, Boolean> tuple = build().toAnalyzer(analysisRegistry, environment); |
620 | | - if (tuple.v2()) { |
621 | | - tuple.v1().close(); |
622 | | - } |
623 | | - } |
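A usage sketch for verify(): validate a config up front, letting the method dispose of any analyzer created purely for the check. This assumes an AnalysisRegistry and Environment are available at the validation call site, and that setAnalyzer is the Builder's setter for the named-analyzer form:

    CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder();
    builder.setAnalyzer("standard");
    // Throws IllegalArgumentException if the analyzer name were unknown to the
    // registry; throws IOException if building the analyzer fails.
    builder.verify(analysisRegistry, environment);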
624 | 422 | } |
625 | 423 | } |