god-eye/internal/discovery/patterns.go

package discovery

import (
	"regexp"
	"sort"
	"strings"
	"sync"
)

// PatternLearner learns naming patterns from discovered subdomains
type PatternLearner struct {
	mu sync.RWMutex

	// Learned components
	prefixes    map[string]int // prefix -> count
	suffixes    map[string]int // suffix -> count
	separators  map[string]int // separator chars -> count
	words       map[string]int // common words -> count
	numbers     map[string]int // number patterns -> count
	environments map[string]int // env indicators -> count

	// Regex patterns for extraction
	numberPattern *regexp.Regexp
	envPattern    *regexp.Regexp
}

// NewPatternLearner creates a new pattern learner
func NewPatternLearner() *PatternLearner {
	return &PatternLearner{
		prefixes:     make(map[string]int),
		suffixes:     make(map[string]int),
		separators:   make(map[string]int),
		words:        make(map[string]int),
		numbers:      make(map[string]int),
		environments: make(map[string]int),
		numberPattern: regexp.MustCompile(`\d+`),
		envPattern:    regexp.MustCompile(`(?i)(dev|test|stage|staging|prod|production|qa|uat|demo|sandbox|beta|alpha|preview|canary)`),
	}
}

// Learn extracts patterns from a subdomain
func (pl *PatternLearner) Learn(subdomain, domain string) {
	// Extract subdomain part
	subPart := strings.TrimSuffix(subdomain, "."+domain)
	if subPart == subdomain || subPart == "" {
		return
	}

	pl.mu.Lock()
	defer pl.mu.Unlock()

	// Split by common separators
	parts := splitByAny(subPart, ".-_")

	// Learn separators used
	for _, sep := range []string{".", "-", "_"} {
		if strings.Contains(subPart, sep) {
			pl.separators[sep]++
		}
	}

	// Learn each part
	for i, part := range parts {
		part = strings.ToLower(part)
		if part == "" {
			continue
		}

		// Track words
		pl.words[part]++

		// First part is typically a prefix
		if i == 0 && len(parts) > 1 {
			pl.prefixes[part]++
		}

		// Last part before domain is often significant
		if i == len(parts)-1 {
			pl.suffixes[part]++
		}

		// Learn number patterns
		if pl.numberPattern.MatchString(part) {
			// Extract just the number pattern style
			numbers := pl.numberPattern.FindAllString(part, -1)
			for _, num := range numbers {
				if len(num) <= 4 { // Reasonable number length
					pl.numbers[num]++
				}
			}
		}

		// Learn environment indicators
		if pl.envPattern.MatchString(part) {
			env := pl.envPattern.FindString(part)
			pl.environments[strings.ToLower(env)]++
		}
	}
}

// GetLearnedPrefixes returns learned prefixes sorted by frequency
func (pl *PatternLearner) GetLearnedPrefixes() []string {
	pl.mu.RLock()
	defer pl.mu.RUnlock()

	return pl.getTopN(pl.prefixes, 20)
}

// GetLearnedSuffixes returns learned suffixes sorted by frequency
func (pl *PatternLearner) GetLearnedSuffixes() []string {
	pl.mu.RLock()
	defer pl.mu.RUnlock()

	return pl.getTopN(pl.suffixes, 20)
}

// GetLearnedWords returns learned words sorted by frequency
func (pl *PatternLearner) GetLearnedWords() []string {
	pl.mu.RLock()
	defer pl.mu.RUnlock()

	return pl.getTopN(pl.words, 50)
}

// GetEnvironments returns detected environment indicators
func (pl *PatternLearner) GetEnvironments() []string {
	pl.mu.RLock()
	defer pl.mu.RUnlock()

	return pl.getTopN(pl.environments, 10)
}

// GenerateSmartWordlist generates a wordlist based on learned patterns
func (pl *PatternLearner) GenerateSmartWordlist(baseWordlist []string) []string {
	pl.mu.RLock()
	defer pl.mu.RUnlock()

	seen := make(map[string]bool)
	var result []string

	// Add base wordlist
	for _, word := range baseWordlist {
		if !seen[word] {
			seen[word] = true
			result = append(result, word)
		}
	}

	// Get learned components
	learnedWords := pl.getTopN(pl.words, 30)
	learnedEnvs := pl.getTopN(pl.environments, 5)
	learnedNumbers := pl.getTopN(pl.numbers, 10)

	// Detect preferred separator
	separator := "-"
	maxSep := 0
	for sep, count := range pl.separators {
		if count > maxSep && sep != "." {
			separator = sep
			maxSep = count
		}
	}

	// Generate combinations
	for _, word := range learnedWords {
		// Word alone
		if !seen[word] {
			seen[word] = true
			result = append(result, word)
		}

		// Word + number
		for _, num := range learnedNumbers {
			combo := word + num
			if !seen[combo] {
				seen[combo] = true
				result = append(result, combo)
			}
			combo = word + separator + num
			if !seen[combo] {
				seen[combo] = true
				result = append(result, combo)
			}
		}

		// Word + environment
		for _, env := range learnedEnvs {
			combo := word + separator + env
			if !seen[combo] {
				seen[combo] = true
				result = append(result, combo)
			}
			combo = env + separator + word
			if !seen[combo] {
				seen[combo] = true
				result = append(result, combo)
			}
		}
	}

	// Environment permutations
	for _, env := range learnedEnvs {
		for _, num := range learnedNumbers {
			combo := env + num
			if !seen[combo] {
				seen[combo] = true
				result = append(result, combo)
			}
			combo = env + separator + num
			if !seen[combo] {
				seen[combo] = true
				result = append(result, combo)
			}
		}
	}

	return result
}

// GeneratePermutations generates permutations for a specific subdomain
func (pl *PatternLearner) GeneratePermutations(subdomain, domain string) []string {
	subPart := strings.TrimSuffix(subdomain, "."+domain)
	if subPart == subdomain || subPart == "" {
		return nil
	}

	pl.mu.RLock()
	defer pl.mu.RUnlock()

	seen := make(map[string]bool)
	var results []string

	parts := splitByAny(subPart, ".-_")
	if len(parts) == 0 {
		return nil
	}

	// Detect separator used
	separator := "-"
	if strings.Contains(subPart, "-") {
		separator = "-"
	} else if strings.Contains(subPart, "_") {
		separator = "_"
	}

	basePart := parts[0]
	learnedEnvs := pl.getTopN(pl.environments, 5)
	learnedNumbers := pl.getTopN(pl.numbers, 5)

	// Generate variations
	// base -> base-dev, base-staging, etc.
	for _, env := range learnedEnvs {
		perm := basePart + separator + env + "." + domain
		if !seen[perm] {
			seen[perm] = true
			results = append(results, perm)
		}
		perm = env + separator + basePart + "." + domain
		if !seen[perm] {
			seen[perm] = true
			results = append(results, perm)
		}
	}

	// base -> base1, base2, base-01, etc.
	for _, num := range learnedNumbers {
		perm := basePart + num + "." + domain
		if !seen[perm] {
			seen[perm] = true
			results = append(results, perm)
		}
		perm = basePart + separator + num + "." + domain
		if !seen[perm] {
			seen[perm] = true
			results = append(results, perm)
		}
	}

	// If multi-part, try variations of inner parts
	if len(parts) > 1 {
		for _, env := range learnedEnvs {
			// api.example.com -> api-dev.example.com
			perm := basePart + separator + env + "." + strings.Join(parts[1:], ".") + "." + domain
			if !seen[perm] {
				seen[perm] = true
				results = append(results, perm)
			}
		}
	}

	return results
}

// getTopN returns top N items from a frequency map
func (pl *PatternLearner) getTopN(m map[string]int, n int) []string {
	type kv struct {
		Key   string
		Value int
	}

	var sorted []kv
	for k, v := range m {
		sorted = append(sorted, kv{k, v})
	}

	sort.Slice(sorted, func(i, j int) bool {
		return sorted[i].Value > sorted[j].Value
	})

	var result []string
	for i := 0; i < n && i < len(sorted); i++ {
		result = append(result, sorted[i].Key)
	}
	return result
}

// Stats returns statistics about learned patterns
type PatternStats struct {
	UniquePrefixes    int
	UniqueSuffixes    int
	UniqueWords       int
	UniqueNumbers     int
	Environments      []string
	PreferredSeparator string
}

// GetStats returns pattern statistics
func (pl *PatternLearner) GetStats() PatternStats {
	pl.mu.RLock()
	defer pl.mu.RUnlock()

	// Find preferred separator
	separator := "."
	maxCount := 0
	for sep, count := range pl.separators {
		if count > maxCount {
			separator = sep
			maxCount = count
		}
	}

	return PatternStats{
		UniquePrefixes:     len(pl.prefixes),
		UniqueSuffixes:     len(pl.suffixes),
		UniqueWords:        len(pl.words),
		UniqueNumbers:      len(pl.numbers),
		Environments:       pl.getTopN(pl.environments, 10),
		PreferredSeparator: separator,
	}
}

// splitByAny splits a string by any of the given separators
func splitByAny(s string, seps string) []string {
	splitter := func(r rune) bool {
		return strings.ContainsRune(seps, r)
	}
	return strings.FieldsFunc(s, splitter)
}