diff --git a/pkg/analyzer/analyzer.go b/pkg/analyzer/analyzer.go
index b043e176a58..a5251060d90 100644
--- a/pkg/analyzer/analyzer.go
+++ b/pkg/analyzer/analyzer.go
@@ -155,6 +155,13 @@ type analyzerInfo struct {
 	fallbackMinifiedFileLOC int
 }
 
+// fileTypeInfo contains file path, detected platform type, and LOC count
+type fileTypeInfo struct {
+	filePath string
+	fileType string
+	locCount int
+}
+
 // Analyzer keeps all the relevant info for the function Analyze
 type Analyzer struct {
 	Paths []string
@@ -318,6 +325,7 @@ func Analyze(a *Analyzer) (model.AnalyzedPaths, error) {
 		Types:       make([]string, 0),
 		Exc:         make([]string, 0),
 		ExpectedLOC: 0,
+		FileStats:   make(map[string]model.FileStatistics),
 	}
 
 	var files []string
@@ -325,6 +333,7 @@ func Analyze(a *Analyzer) (model.AnalyzedPaths, error) {
 	// results is the channel shared by the workers that contains the types found
 	results := make(chan string)
 	locCount := make(chan int)
+	fileInfo := make(chan fileTypeInfo)
 	ignoreFiles := make([]string, 0)
 	projectConfigFiles := make([]string, 0)
 	done := make(chan bool)
@@ -374,7 +383,7 @@ func Analyze(a *Analyzer) (model.AnalyzedPaths, error) {
 			filePath:                file,
 			fallbackMinifiedFileLOC: a.FallbackMinifiedFileLOC,
 		}
-		go a.worker(results, unwanted, locCount, &wg)
+		go a.worker(results, unwanted, locCount, fileInfo, &wg)
 	}
 
 	go func() {
@@ -383,27 +392,35 @@ func Analyze(a *Analyzer) (model.AnalyzedPaths, error) {
 			close(unwanted)
 			close(results)
 			close(locCount)
+			close(fileInfo)
 		}()
 		wg.Wait()
 		done <- true
 	}()
 
-	availableTypes, unwantedPaths, loc := computeValues(results, unwanted, locCount, done)
+	availableTypes, unwantedPaths, loc, fileStats := computeValues(results, unwanted, locCount, fileInfo, done)
 	multiPlatformTypeCheck(&availableTypes)
 	unwantedPaths = append(unwantedPaths, ignoreFiles...)
 	unwantedPaths = append(unwantedPaths, projectConfigFiles...)
 
 	returnAnalyzedPaths.Types = availableTypes
 	returnAnalyzedPaths.Exc = unwantedPaths
 	returnAnalyzedPaths.ExpectedLOC = loc
+	returnAnalyzedPaths.FileStats = fileStats
 
 	// stop metrics for file analyzer
 	metrics.Metric.Stop()
 	return returnAnalyzedPaths, nil
 }
 
 // worker determines the type of the file by ext (dockerfile and terraform)/content and
-// writes the answer to the results channel
+// writes the answer to the results channel and file info for statistics
 // if no types were found, the worker will write the path of the file in the unwanted channel
-func (a *analyzerInfo) worker(results, unwanted chan<- string, locCount chan<- int, wg *sync.WaitGroup) { //nolint: gocyclo
+func (a *analyzerInfo) worker( //nolint: gocyclo
+	results,
+	unwanted chan<- string,
+	locCount chan<- int,
+	fileInfo chan<- fileTypeInfo,
+	wg *sync.WaitGroup,
+) {
 	defer func() {
 		if err := recover(); err != nil {
 			log.Warn().Msgf("Recovered from analyzing panic for file %s with error: %#v", a.filePath, err.(error).Error())
@@ -422,12 +439,14 @@ func (a *analyzerInfo) worker(results, unwanted chan<- string, locCount chan<- i
 		if a.isAvailableType(dockerfile) {
 			results <- dockerfile
 			locCount <- linesCount
+			fileInfo <- fileTypeInfo{filePath: a.filePath, fileType: dockerfile, locCount: linesCount}
 		}
 	// Dockerfile (indirect identification)
 	case "possibleDockerfile", ".ubi8", ".debian":
 		if a.isAvailableType(dockerfile) && isDockerfile(a.filePath) {
 			results <- dockerfile
 			locCount <- linesCount
+			fileInfo <- fileTypeInfo{filePath: a.filePath, fileType: dockerfile, locCount: linesCount}
 		} else {
 			unwanted <- a.filePath
 		}
@@ -436,30 +455,34 @@ func (a *analyzerInfo) worker(results, unwanted chan<- string, locCount chan<- i
 		if a.isAvailableType(terraform) {
 			results <- terraform
 			locCount <- linesCount
+			fileInfo <- fileTypeInfo{filePath: a.filePath, fileType: terraform, locCount: linesCount}
 		}
 	// Bicep
 	case ".bicep":
 		if a.isAvailableType(bicep) {
 			results <- arm
 			locCount <- linesCount
+			fileInfo <- fileTypeInfo{filePath: a.filePath, fileType: arm, locCount: linesCount}
 		}
 	// GRPC
 	case ".proto":
 		if a.isAvailableType(grpc) {
 			results <- grpc
 			locCount <- linesCount
+			fileInfo <- fileTypeInfo{filePath: a.filePath, fileType: grpc, locCount: linesCount}
 		}
 	// It could be Ansible Config or Ansible Inventory
 	case ".cfg", ".conf", ".ini":
 		if a.isAvailableType(ansible) {
 			results <- ansible
 			locCount <- linesCount
+			fileInfo <- fileTypeInfo{filePath: a.filePath, fileType: ansible, locCount: linesCount}
 		}
 	/* It could be Ansible, Buildah, CICD, CloudFormation, Crossplane, OpenAPI, Azure Resource Manager
 	Docker Compose, Knative, Kubernetes, Pulumi, ServerlessFW or Google Deployment Manager.
 	We also have FHIR's case which will be ignored since it's not a platform file.*/
 	case yaml, yml, json, sh:
-		a.checkContent(results, unwanted, locCount, linesCount, ext)
+		a.checkContent(results, unwanted, locCount, fileInfo, linesCount, ext)
 	}
 }
 
@@ -500,7 +523,14 @@ func needsOverride(check bool, returnType, key, ext string) bool {
 
 // checkContent will determine the file type by content when worker was unable to
 // determine by ext, if no type was determined checkContent adds it to unwanted channel
-func (a *analyzerInfo) checkContent(results, unwanted chan<- string, locCount chan<- int, linesCount int, ext string) {
+func (a *analyzerInfo) checkContent(
+	results,
+	unwanted chan<- string,
+	locCount chan<- int,
+	fileInfo chan<- fileTypeInfo,
+	linesCount int,
+	ext string,
+) {
 	typesFlag := a.typesFlag
 	excludeTypesFlag := a.excludeTypesFlag
 	// get file content
@@ -558,6 +588,7 @@ func (a *analyzerInfo) checkContent(results, unwanted chan<- string, locCount ch
 
 	results <- returnType
 	locCount <- linesCount
+	fileInfo <- fileTypeInfo{filePath: a.filePath, fileType: returnType, locCount: linesCount}
 }
 
 func checkReturnType(path, returnType, ext string, content []byte) string {
@@ -661,10 +692,21 @@ func checkForAnsibleHost(yamlContent model.Document) bool {
 
 // computeValues computes expected Lines of Code to be scanned from locCount channel
 // and creates the types and unwanted slices from the channels removing any duplicates
-func computeValues(types, unwanted chan string, locCount chan int, done chan bool) (typesS, unwantedS []string, locTotal int) {
+// also collects file statistics for memory calculation
+func computeValues(
+	types,
+	unwanted chan string,
+	locCount chan int,
+	fileInfo chan fileTypeInfo,
+	done chan bool,
+) (typesS, unwantedS []string, locTotal int, stats map[string]model.FileStatistics) {
 	var val int
 	unwantedSlice := make([]string, 0)
 	typeSlice := make([]string, 0)
+	stats = make(map[string]model.FileStatistics)
+
+	platformFilesInfo := make(map[string][]fileTypeInfo)
+
 	for {
 		select {
 		case i := <-locCount:
@@ -677,8 +719,28 @@ func computeValues(types, unwanted chan string, locCount chan int, done chan boo
 			if !utils.Contains(i, typeSlice) {
 				typeSlice = append(typeSlice, i)
 			}
+		case info := <-fileInfo:
+			platformFilesInfo[info.fileType] = append(platformFilesInfo[info.fileType], info)
 		case <-done:
-			return typeSlice, unwantedSlice, val
+			for platformType, filesInfo := range platformFilesInfo {
+				dirMap := make(map[string]int)
+				totalLOC := 0
+
+				for _, fileInfo := range filesInfo {
+					dir := filepath.Dir(fileInfo.filePath)
+					dirMap[dir]++
+					totalLOC += fileInfo.locCount
+				}
+
+				stats[platformType] = model.FileStatistics{
+					FileCount:      len(filesInfo),
+					DirectoryCount: len(dirMap),
+					FilesByDir:     dirMap,
+					TotalLOC:       totalLOC,
+				}
+			}
+
+			return typeSlice, unwantedSlice, val, stats
 		}
 	}
 }
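
Taken together, the analyzer.go hunks above thread one more channel through an existing fan-out/fan-in: every worker now publishes a fileTypeInfo alongside its results and locCount messages, and computeValues buffers those per platform, folding them into model.FileStatistics only when done fires. A minimal, self-contained sketch of that shape, simplified from the diff (the real collector multiplexes several channels with select; here a single channel is drained with range):

package main

import (
	"fmt"
	"path/filepath"
	"sync"
)

// fileTypeInfo mirrors the struct added in the diff.
type fileTypeInfo struct {
	filePath string
	fileType string
	locCount int
}

func main() {
	files := []fileTypeInfo{
		{filePath: "deploy/app.yaml", fileType: "kubernetes", locCount: 40},
		{filePath: "deploy/svc.yaml", fileType: "kubernetes", locCount: 25},
		{filePath: "infra/main.tf", fileType: "terraform", locCount: 60},
	}

	fileInfo := make(chan fileTypeInfo)
	var wg sync.WaitGroup

	// Fan-out: one goroutine per file, like the analyzer's worker pool.
	for _, f := range files {
		wg.Add(1)
		go func(f fileTypeInfo) {
			defer wg.Done()
			fileInfo <- f
		}(f)
	}

	// Close the channel once every worker has sent its result.
	go func() {
		wg.Wait()
		close(fileInfo)
	}()

	// Fan-in: group per platform, then aggregate directory and LOC counts --
	// the same computation computeValues performs in its done branch.
	perPlatform := make(map[string][]fileTypeInfo)
	for info := range fileInfo {
		perPlatform[info.fileType] = append(perPlatform[info.fileType], info)
	}
	for platform, infos := range perPlatform {
		dirs := make(map[string]int)
		totalLOC := 0
		for _, fi := range infos {
			dirs[filepath.Dir(fi.filePath)]++
			totalLOC += fi.locCount
		}
		fmt.Printf("%s: %d files, %d dirs, %d LOC\n", platform, len(infos), len(dirs), totalLOC)
	}
}

Closing the channel from a goroutine that waits on the WaitGroup is what lets the receive loop terminate cleanly; the deferred close calls in Analyze play the same role for the real channels.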
diff --git a/pkg/analyzer/analyzer_test.go b/pkg/analyzer/analyzer_test.go
index 18f14bb0f23..de08e1381d1 100644
--- a/pkg/analyzer/analyzer_test.go
+++ b/pkg/analyzer/analyzer_test.go
@@ -588,3 +588,132 @@ func TestAnalyzer_Analyze(t *testing.T) {
 		})
 	}
 }
+
+func TestAnalyzer_FileStats(t *testing.T) {
+	tests := []struct {
+		name                 string
+		paths                []string
+		typesFromFlag        []string
+		excludeTypesFromFlag []string
+		wantPlatformStats    map[string]platformFileStats
+		gitIgnoreFileName    string
+		excludeGitIgnore     bool
+		MaxFileSize          int
+	}{
+		{
+			name:                 "file_stats_nested_structure_with_multiple_platforms",
+			paths:                []string{filepath.FromSlash("../../test/fixtures/analyzer_test/helm")},
+			typesFromFlag:        []string{""},
+			excludeTypesFromFlag: []string{""},
+			wantPlatformStats: map[string]platformFileStats{
+				"kubernetes": {
+					fileCount: 3,
+					dirCount:  2,
+					totalLOC:  118,
+				},
+			},
+			gitIgnoreFileName: "",
+			excludeGitIgnore:  true,
+			MaxFileSize:       -1,
+		},
+		{
+			name:                 "file_stats_multiple_platforms_nested_directories",
+			paths:                []string{filepath.FromSlash("../../test/fixtures/analyzer_test")},
+			typesFromFlag:        []string{""},
+			excludeTypesFromFlag: []string{""},
+			wantPlatformStats: map[string]platformFileStats{
+				"terraform": {
+					fileCount: 1,
+					dirCount:  1,
+					totalLOC:  10,
+				},
+				"kubernetes": {
+					fileCount: 4,
+					dirCount:  3,
+					totalLOC:  131,
+				},
+				"dockerfile": {
+					fileCount: 1,
+					dirCount:  1,
+					totalLOC:  3,
+				},
+			},
+			gitIgnoreFileName: "",
+			excludeGitIgnore:  true,
+			MaxFileSize:       -1,
+		},
+		{
+			name:                 "file_stats_with_type_filter",
+			paths:                []string{filepath.FromSlash("../../test/fixtures/analyzer_test")},
+			typesFromFlag:        []string{"terraform", "kubernetes"},
+			excludeTypesFromFlag: []string{""},
+			wantPlatformStats: map[string]platformFileStats{
+				"terraform": {
+					fileCount: 1,
+					dirCount:  1,
+					totalLOC:  10,
+				},
+				"kubernetes": {
+					fileCount: 6,
+					dirCount:  3,
+					totalLOC:  156,
+				},
+			},
+			gitIgnoreFileName: "",
+			excludeGitIgnore:  true,
+			MaxFileSize:       -1,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			exc := []string{""}
+
+			analyzer := &Analyzer{
+				Paths:             tt.paths,
+				Types:             tt.typesFromFlag,
+				ExcludeTypes:      tt.excludeTypesFromFlag,
+				Exc:               exc,
+				ExcludeGitIgnore:  tt.excludeGitIgnore,
+				GitIgnoreFileName: tt.gitIgnoreFileName,
+				MaxFileSize:       tt.MaxFileSize,
+			}
+
+			got, err := Analyze(analyzer)
+			require.NoError(t, err)
+
+			require.NotNil(t, got.FileStats, "FileStats should not be nil")
+
+			for platform, expectedStats := range tt.wantPlatformStats {
+				platformStats, exists := got.FileStats[platform]
+				require.True(t, exists, "FileStats should contain platform: %s", platform)
+
+				require.Equal(t, expectedStats.fileCount, platformStats.FileCount,
+					"wrong file count for platform %s", platform)
+
+				require.Equal(t, expectedStats.dirCount, platformStats.DirectoryCount,
+					"wrong directory count for platform %s", platform)
+
+				require.Equal(t, expectedStats.totalLOC, platformStats.TotalLOC,
+					"wrong total LOC for platform %s", platform)
+
+				require.NotNil(t, platformStats.FilesByDir, "FilesByDir should not be nil")
+				require.Equal(t, expectedStats.dirCount, len(platformStats.FilesByDir),
+					"wrong FilesByDir entries for platform %s", platform)
+
+				totalFilesFromDirs := 0
+				for _, fileCount := range platformStats.FilesByDir {
+					totalFilesFromDirs += fileCount
+				}
+				require.Equal(t, platformStats.FileCount, totalFilesFromDirs,
+					"file count sum mismatch for platform %s", platform)
+			}
+		})
+	}
+}
+
+type platformFileStats struct {
+	fileCount int
+	dirCount  int
+	totalLOC  int
+}
"github.com/Checkmarx/kics/v2/pkg/utils" ) var ( diff --git a/pkg/model/model.go b/pkg/model/model.go index 0a8225337b9..a2f2bc88547 100644 --- a/pkg/model/model.go +++ b/pkg/model/model.go @@ -293,6 +293,15 @@ type AnalyzedPaths struct { Types []string Exc []string ExpectedLOC int + FileStats map[string]FileStatistics +} + +// FileStatistics contains file and directory counts per platform type +type FileStatistics struct { + FileCount int + DirectoryCount int + FilesByDir map[string]int + TotalLOC int } // ResolvedFileSplit is a struct that contains the information of a resolved file, the path and the lines of the file diff --git a/pkg/parser/terraform/variables.go b/pkg/parser/terraform/variables.go index ae20712d5c5..05f6ee683ba 100644 --- a/pkg/parser/terraform/variables.go +++ b/pkg/parser/terraform/variables.go @@ -6,16 +6,24 @@ import ( "path/filepath" "regexp" "strings" + "sync" - "github.com/Checkmarx/kics/v2/pkg/parser/terraform/converter" "github.com/hashicorp/hcl/v2" "github.com/hashicorp/hcl/v2/hclsyntax" "github.com/rs/zerolog/log" "github.com/zclconf/go-cty/cty" + + "github.com/Checkmarx/kics/v2/pkg/parser/terraform/converter" ) var inputVariableMap = make(converter.VariableMap) +// Cache for directory-level variable resolution +var ( + variableCache = make(map[string]converter.VariableMap) + variableCacheMutex sync.RWMutex +) + func mergeMaps(baseMap, newItems converter.VariableMap) { for key, value := range newItems { baseMap[key] = value @@ -87,12 +95,18 @@ func getInputVariablesFromFile(filename string) (converter.VariableMap, error) { return variables, nil } -func getInputVariables(currentPath, fileContent, terraformVarsPath string) { +// buildVariablesForDirectory scans a directory once and builds the complete variable map +func buildVariablesForDirectory(currentPath, terraformVarsPath string) (converter.VariableMap, error) { variablesMap := make(converter.VariableMap) + + // Get all .tf files tfFiles, err := filepath.Glob(filepath.Join(currentPath, "*.tf")) if err != nil { log.Error().Msg("Error getting .tf files") + return variablesMap, err } + + // Process all .tf files for default values for _, tfFile := range tfFiles { variables, errDefaultValues := setInputVariablesDefaultValues(tfFile) if errDefaultValues != nil { @@ -124,6 +138,28 @@ func getInputVariables(currentPath, fileContent, terraformVarsPath string) { mergeMaps(variablesMap, variables) } + // Process custom terraform vars path if provided + if terraformVarsPath != "" { + _, err = os.Stat(terraformVarsPath) + if err != nil { + log.Trace().Msgf("%s file not found", terraformVarsPath) + } else { + variables, errInputVariables := getInputVariablesFromFile(terraformVarsPath) + if errInputVariables != nil { + log.Error().Msgf("Error getting values from %s", terraformVarsPath) + log.Err(errInputVariables) + } else { + mergeMaps(variablesMap, variables) + } + } + } + + return variablesMap, nil +} + +// getInputVariables now uses caching to avoid rescanning directories +func getInputVariables(currentPath, fileContent, terraformVarsPath string) { + // Extract terraform vars path from file content if not provided // If the flag is empty let's look for the value in the first written line of the file if terraformVarsPath == "" { terraformVarsPathRegex := regexp.MustCompile(`(?m)^\s*// kics_terraform_vars: ([\w/\\.:-]+)\r?\n`) @@ -131,7 +167,7 @@ func getInputVariables(currentPath, fileContent, terraformVarsPath string) { if terraformVarsPathMatch != nil { // There is a path tp the variables file in the file so that 
diff --git a/pkg/parser/terraform/variables.go b/pkg/parser/terraform/variables.go
index ae20712d5c5..05f6ee683ba 100644
--- a/pkg/parser/terraform/variables.go
+++ b/pkg/parser/terraform/variables.go
@@ -6,16 +6,24 @@ import (
 	"path/filepath"
 	"regexp"
 	"strings"
+	"sync"
 
-	"github.com/Checkmarx/kics/v2/pkg/parser/terraform/converter"
 	"github.com/hashicorp/hcl/v2"
 	"github.com/hashicorp/hcl/v2/hclsyntax"
 	"github.com/rs/zerolog/log"
 	"github.com/zclconf/go-cty/cty"
+
+	"github.com/Checkmarx/kics/v2/pkg/parser/terraform/converter"
 )
 
 var inputVariableMap = make(converter.VariableMap)
 
+// Cache for directory-level variable resolution
+var (
+	variableCache      = make(map[string]converter.VariableMap)
+	variableCacheMutex sync.RWMutex
+)
+
 func mergeMaps(baseMap, newItems converter.VariableMap) {
 	for key, value := range newItems {
 		baseMap[key] = value
@@ -87,12 +95,18 @@ func getInputVariablesFromFile(filename string) (converter.VariableMap, error) {
 	return variables, nil
 }
 
-func getInputVariables(currentPath, fileContent, terraformVarsPath string) {
+// buildVariablesForDirectory scans a directory once and builds the complete variable map
+func buildVariablesForDirectory(currentPath, terraformVarsPath string) (converter.VariableMap, error) {
 	variablesMap := make(converter.VariableMap)
+
+	// Get all .tf files
 	tfFiles, err := filepath.Glob(filepath.Join(currentPath, "*.tf"))
 	if err != nil {
 		log.Error().Msg("Error getting .tf files")
+		return variablesMap, err
 	}
+
+	// Process all .tf files for default values
 	for _, tfFile := range tfFiles {
 		variables, errDefaultValues := setInputVariablesDefaultValues(tfFile)
 		if errDefaultValues != nil {
@@ -124,6 +138,28 @@ func getInputVariables(currentPath, fileContent, terraformVarsPath string) {
 		mergeMaps(variablesMap, variables)
 	}
 
+	// Process custom terraform vars path if provided
+	if terraformVarsPath != "" {
+		_, err = os.Stat(terraformVarsPath)
+		if err != nil {
+			log.Trace().Msgf("%s file not found", terraformVarsPath)
+		} else {
+			variables, errInputVariables := getInputVariablesFromFile(terraformVarsPath)
+			if errInputVariables != nil {
+				log.Error().Msgf("Error getting values from %s", terraformVarsPath)
+				log.Err(errInputVariables)
+			} else {
+				mergeMaps(variablesMap, variables)
+			}
+		}
+	}
+
+	return variablesMap, nil
+}
+
+// getInputVariables now uses caching to avoid rescanning directories
+func getInputVariables(currentPath, fileContent, terraformVarsPath string) {
+	// Extract terraform vars path from file content if not provided
 	// If the flag is empty let's look for the value in the first written line of the file
 	if terraformVarsPath == "" {
 		terraformVarsPathRegex := regexp.MustCompile(`(?m)^\s*// kics_terraform_vars: ([\w/\\.:-]+)\r?\n`)
@@ -131,7 +167,7 @@ func getInputVariables(currentPath, fileContent, terraformVarsPath string) {
 		if terraformVarsPathMatch != nil {
 			// There is a path tp the variables file in the file so that will be the path to the variables tf file
 			terraformVarsPath = terraformVarsPathMatch[1]
-			// If the path contains ":" assume its a global path
+			// If the path contains ":" assume it's a global path
 			if !strings.Contains(terraformVarsPath, ":") {
 				// If not then add the current folder path before so that the comment path can be relative
 				terraformVarsPath = filepath.Join(currentPath, terraformVarsPath)
@@ -139,22 +175,32 @@ func getInputVariables(currentPath, fileContent, terraformVarsPath string) {
 			}
 		}
 	}
 
-	// If the terraformVarsPath is empty, this means that it is not in the flag
-	// and it is not in the first written line of the file
+	// Create a cache key that includes the custom vars path if provided
+	cacheKey := currentPath
 	if terraformVarsPath != "" {
-		_, err = os.Stat(terraformVarsPath)
-		if err != nil {
-			log.Trace().Msgf("%s file not found", terraformVarsPath)
-		} else {
-			variables, errInputVariables := getInputVariablesFromFile(terraformVarsPath)
-			if errInputVariables != nil {
-				log.Error().Msgf("Error getting values from %s", terraformVarsPath)
-				log.Err(errInputVariables)
-			} else {
-				mergeMaps(variablesMap, variables)
-			}
-		}
+		cacheKey = currentPath + "|" + terraformVarsPath
 	}
 
+	// Try to get from cache first
+	variableCacheMutex.RLock()
+	if cachedVars, exists := variableCache[cacheKey]; exists {
+		variableCacheMutex.RUnlock()
+		inputVariableMap["var"] = cty.ObjectVal(cachedVars)
+		return
+	}
+	variableCacheMutex.RUnlock()
+
+	// Cache miss - build variables for this directory
+	variablesMap, err := buildVariablesForDirectory(currentPath, terraformVarsPath)
+	if err != nil {
+		log.Error().Msgf("Error building variables for directory %s: %v", currentPath, err)
+		return
+	}
+
+	// Store in cache
+	variableCacheMutex.Lock()
+	variableCache[cacheKey] = variablesMap
+	variableCacheMutex.Unlock()
+
 	inputVariableMap["var"] = cty.ObjectVal(variablesMap)
 }
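
The cache added to variables.go uses a read-lock fast path with a write-lock fill. Note that the check and the insert are not atomic: two parser goroutines that miss on the same key will both call buildVariablesForDirectory, and the second write simply overwrites the first with an equivalent map — duplicated work, but a correct result, which is why a plain sync.RWMutex is a reasonable trade against heavier per-key sync.Once machinery. The pattern in isolation (illustrative names, not KICS identifiers):

package cacheexample

import "sync"

var (
	cacheMu sync.RWMutex
	cache   = map[string]int{}
)

// lookup returns the cached value for key, computing and storing it on a miss.
func lookup(key string, build func(string) int) int {
	cacheMu.RLock()
	if v, ok := cache[key]; ok {
		cacheMu.RUnlock() // fast path: shared read lock only
		return v
	}
	cacheMu.RUnlock()

	// Check-then-act gap: another goroutine may build the same key here.
	v := build(key)

	cacheMu.Lock()
	cache[key] = v // last writer wins; equivalent results make this benign
	cacheMu.Unlock()
	return v
}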
diff --git a/pkg/parser/terraform/variables_test.go b/pkg/parser/terraform/variables_test.go
index 152ac28ab7e..7d604970ee8 100644
--- a/pkg/parser/terraform/variables_test.go
+++ b/pkg/parser/terraform/variables_test.go
@@ -204,5 +204,135 @@ func TestGetInputVariables(t *testing.T) {
 	}
 	t.Cleanup(func() {
 		inputVariableMap = make(converter.VariableMap)
+		variableCache = make(map[string]converter.VariableMap)
 	})
 }
+
+func TestBuildVariablesForDirectory(t *testing.T) {
+	tests := []struct {
+		name              string
+		currentPath       string
+		terraformVarsPath string
+		want              converter.VariableMap
+		wantErr           bool
+	}{
+		{
+			name:              "Should build variables from directory with multiple tf files",
+			currentPath:       filepath.Join("..", "..", "..", "test", "fixtures", "test_terraform_variables"),
+			terraformVarsPath: "",
+			want: converter.VariableMap{
+				"test1": cty.BoolVal(false),
+				"test2": cty.TupleVal([]cty.Value{cty.BoolVal(false), cty.BoolVal(true)}),
+				"map1": cty.ObjectVal(map[string]cty.Value{
+					"map1key1": cty.StringVal("map2Key1"),
+				}),
+				"map2": cty.ObjectVal(map[string]cty.Value{
+					"map2Key1": cty.StringVal("nestedMap"),
+				}),
+				"test_terraform":    cty.StringVal("terraform.tfvars"),
+				"default_var_file":  cty.StringVal("default_var_file"),
+				"local_default_var": cty.StringVal("local_default"),
+			},
+			wantErr: false,
+		},
+		{
+			name:              "Should build variables with custom vars path",
+			currentPath:       filepath.Join("..", "..", "..", "test", "fixtures", "test_terraform_variables"),
+			terraformVarsPath: filepath.Join("..", "..", "..", "test", "fixtures", "test_terraform_variables", "varsToUse", "varsToUse.tf"),
+			want: converter.VariableMap{
+				"test1": cty.BoolVal(false),
+				"test2": cty.TupleVal([]cty.Value{cty.BoolVal(false), cty.BoolVal(true)}),
+				"map1": cty.ObjectVal(map[string]cty.Value{
+					"map1key1": cty.StringVal("map2Key1"),
+				}),
+				"map2": cty.ObjectVal(map[string]cty.Value{
+					"map2Key1": cty.StringVal("nestedMap"),
+				}),
+				"map3": cty.ObjectVal(map[string]cty.Value{
+					"map3Key1": cty.StringVal("givenByVar"),
+				}),
+				"test_terraform":    cty.StringVal("terraform.tfvars"),
+				"default_var_file":  cty.StringVal("default_var_file"),
+				"local_default_var": cty.StringVal("local_default"),
+			},
+			wantErr: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, err := buildVariablesForDirectory(tt.currentPath, tt.terraformVarsPath)
+			if tt.wantErr {
+				require.NotNil(t, err)
+			} else {
+				require.NoError(t, err)
+				require.Equal(t, tt.want, got)
+			}
+		})
+	}
+
+	t.Cleanup(func() {
+		variableCache = make(map[string]converter.VariableMap)
+	})
+}
+
+func TestGetInputVariables_Caching(t *testing.T) {
+	tests := []struct {
+		name             string
+		firstCallPath    string
+		secondCallPath   string
+		shouldUseCache   bool
+		expectedVarCount int
+	}{
+		{
+			name:             "Should use cache for same directory",
+			firstCallPath:    filepath.Join("..", "..", "..", "test", "fixtures", "test_terraform_variables"),
+			secondCallPath:   filepath.Join("..", "..", "..", "test", "fixtures", "test_terraform_variables"),
+			shouldUseCache:   true,
+			expectedVarCount: 7,
+		},
+		{
+			name:             "Should not use cache for different directory",
+			firstCallPath:    filepath.Join("..", "..", "..", "test", "fixtures", "test_terraform_variables"),
+			secondCallPath:   filepath.Join("..", "..", "..", "test", "fixtures", "test_terraform_variables", "varsToUse"),
+			shouldUseCache:   false,
+			expectedVarCount: 0,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			variableCache = make(map[string]converter.VariableMap)
+			inputVariableMap = make(converter.VariableMap)
+
+			fileContent1, err := os.ReadFile(filepath.Join(tt.firstCallPath, "test.tf"))
+			require.NoError(t, err)
+			getInputVariables(tt.firstCallPath, string(fileContent1), "")
+
+			cacheSize := len(variableCache)
+			require.Equal(t, 1, cacheSize, "cache should have one entry after first call")
+
+			fileContent2, err := os.ReadFile(filepath.Join(tt.secondCallPath, "test.tf"))
+			if err != nil {
+				fileContent2, err = os.ReadFile(filepath.Join(tt.secondCallPath, "varsToUse.tf"))
+			}
+			require.NoError(t, err)
+			getInputVariables(tt.secondCallPath, string(fileContent2), "")
+
+			if tt.shouldUseCache {
+				require.Equal(t, cacheSize, len(variableCache), "cache size should not change when reusing cache")
+			} else {
+				require.Equal(t, cacheSize+1, len(variableCache), "cache should grow for new directory")
+			}
+
+			varObj, ok := inputVariableMap["var"]
+			require.True(t, ok, "inputVariableMap should contain 'var' key")
+			require.Equal(t, tt.expectedVarCount, len(varObj.AsValueMap()), "wrong number of variables")
+		})
+	}
+
+	t.Cleanup(func() {
+		inputVariableMap = make(converter.VariableMap)
+		variableCache = make(map[string]converter.VariableMap)
+	})
+}
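
One caveat the new tests handle by hand: variableCache and inputVariableMap are package-level, so cached variables persist for the life of the process and are never invalidated if files change on disk between parses. The tests reset them by reassignment inside t.Cleanup; a long-lived embedder of the parser could do the same behind the mutex. A hypothetical helper, not present in this diff:

// resetVariableCache is a hypothetical helper (not in this diff), placed in
// package terraform next to the variables it touches, showing how an embedder
// could safely invalidate the directory-level variable cache.
func resetVariableCache() {
	variableCacheMutex.Lock()
	defer variableCacheMutex.Unlock()
	variableCache = make(map[string]converter.VariableMap)
}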