Files
CyberStrikeAI/internal/knowledge/index_pipeline.go
T
2026-04-19 01:17:09 +08:00

92 lines
2.3 KiB
Go

package knowledge
import (
"context"
"database/sql"
"fmt"
"strings"
"cyberstrike-ai/internal/config"
"github.com/cloudwego/eino/compose"
"github.com/cloudwego/eino/components/document"
"github.com/cloudwego/eino/schema"
)
// normalizeChunkStrategy returns "recursive" or "markdown_then_recursive".
func normalizeChunkStrategy(s string) string {
v := strings.TrimSpace(strings.ToLower(s))
switch v {
case "recursive":
return "recursive"
case "markdown_then_recursive", "markdown_recursive", "markdown":
return "markdown_then_recursive"
case "":
return "markdown_then_recursive"
default:
return "markdown_then_recursive"
}
}
func buildKnowledgeIndexChain(
ctx context.Context,
indexingCfg *config.IndexingConfig,
db *sql.DB,
recursive document.Transformer,
embeddingModel string,
) (compose.Runnable[[]*schema.Document, []string], error) {
if recursive == nil {
return nil, fmt.Errorf("recursive transformer is nil")
}
if db == nil {
return nil, fmt.Errorf("db is nil")
}
strategy := normalizeChunkStrategy("markdown_then_recursive")
batch := 64
maxChunks := 0
if indexingCfg != nil {
strategy = normalizeChunkStrategy(indexingCfg.ChunkStrategy)
if indexingCfg.BatchSize > 0 {
batch = indexingCfg.BatchSize
}
maxChunks = indexingCfg.MaxChunksPerItem
}
si := NewSQLiteIndexer(db, batch, embeddingModel)
ch := compose.NewChain[[]*schema.Document, []string]()
if strategy != "recursive" {
md, err := newMarkdownHeaderSplitter(ctx)
if err != nil {
return nil, fmt.Errorf("markdown splitter: %w", err)
}
ch.AppendDocumentTransformer(md)
}
ch.AppendDocumentTransformer(recursive)
ch.AppendLambda(newChunkEnrichLambda(maxChunks))
ch.AppendIndexer(si)
return ch.Compile(ctx)
}
func newChunkEnrichLambda(maxChunks int) *compose.Lambda {
return compose.InvokableLambda(func(ctx context.Context, docs []*schema.Document) ([]*schema.Document, error) {
_ = ctx
out := make([]*schema.Document, 0, len(docs))
for _, d := range docs {
if d == nil || strings.TrimSpace(d.Content) == "" {
continue
}
out = append(out, d)
}
if maxChunks > 0 && len(out) > maxChunks {
out = out[:maxChunks]
}
for i, d := range out {
if d.MetaData == nil {
d.MetaData = make(map[string]any)
}
d.MetaData[metaKBChunkIndex] = i
}
return out, nil
})
}