diff --git a/arazzo_test.go b/arazzo_test.go index 8b7b1e22..54c97093 100644 --- a/arazzo_test.go +++ b/arazzo_test.go @@ -4,12 +4,21 @@ package libopenapi import ( + "reflect" + "sync" "testing" + "unsafe" + "github.com/pb33f/libopenapi/datamodel/low" + lowArazzo "github.com/pb33f/libopenapi/datamodel/low/arazzo" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.yaml.in/yaml/v4" ) +//go:linkname arazzoLowBuildModelFieldCache github.com/pb33f/libopenapi/datamodel/low.buildModelFieldCache +var arazzoLowBuildModelFieldCache sync.Map + func TestNewArazzoDocument_ValidFull(t *testing.T) { yml := []byte(`arazzo: 1.0.1 info: @@ -177,6 +186,67 @@ func TestNewArazzoDocument_ArrayYAML(t *testing.T) { assert.Contains(t, err.Error(), "expected YAML mapping") } +func TestNewArazzoDocument_BuildModelError(t *testing.T) { + yml := []byte(`arazzo: 1.0.1 +`) + var root yaml.Node + require.NoError(t, yaml.Unmarshal(yml, &root)) + + var seed lowArazzo.Arazzo + require.NoError(t, low.BuildModel(root.Content[0], &seed)) + + arazzoType := reflect.TypeOf(lowArazzo.Arazzo{}) + original, ok := arazzoLowBuildModelFieldCache.Load(arazzoType) + require.True(t, ok) + + origType := reflect.TypeOf(original) + elemType := origType.Elem() + replacement := reflect.MakeSlice(origType, 1, 1) + elem := reflect.New(elemType).Elem() + setArazzoUnexportedField(elem.FieldByName("lookupKey"), "arazzo") + setArazzoUnexportedField(elem.FieldByName("index"), 0) + setArazzoUnexportedField(elem.FieldByName("kind"), reflect.Bool) + replacement.Index(0).Set(elem) + + arazzoLowBuildModelFieldCache.Store(arazzoType, replacement.Interface()) + t.Cleanup(func() { + arazzoLowBuildModelFieldCache.Store(arazzoType, original) + }) + + doc, err := NewArazzoDocument(yml) + assert.Error(t, err) + assert.Nil(t, doc) + assert.Contains(t, err.Error(), "failed to build low-level model") + assert.Contains(t, err.Error(), "unsupported type") +} + +func TestNewArazzoDocument_BuildError(t *testing.T) { + yml := []byte(`arazzo: 1.0.1 +info: + title: Build Error + version: 1.0.0 +sourceDescriptions: + - name: api + url: https://example.com/openapi.yaml +workflows: + - workflowId: wf1 + steps: + - stepId: s1 + operationId: op1 +components: + failureActions: + badRetry: + name: retry + type: retry + retryAfter: nope +`) + doc, err := NewArazzoDocument(yml) + assert.Error(t, err) + assert.Nil(t, doc) + assert.Contains(t, err.Error(), "failed to build arazzo document") + assert.Contains(t, err.Error(), "invalid retryAfter") +} + func TestNewArazzoDocument_MultipleWorkflows(t *testing.T) { yml := []byte(`arazzo: 1.0.1 info: @@ -467,3 +537,7 @@ workflows: assert.Len(t, doc2.Workflows, len(doc1.Workflows)) assert.Equal(t, doc1.Workflows[0].WorkflowId, doc2.Workflows[0].WorkflowId) } + +func setArazzoUnexportedField(field reflect.Value, value any) { + reflect.NewAt(field.Type(), unsafe.Pointer(field.UnsafeAddr())).Elem().Set(reflect.ValueOf(value)) +} diff --git a/datamodel/document_config.go b/datamodel/document_config.go index 9c2df0c6..3a431005 100644 --- a/datamodel/document_config.go +++ b/datamodel/document_config.go @@ -101,11 +101,16 @@ type DocumentConfiguration struct { // passed in and used. Only enable this when parsing non openapi documents. BypassDocumentCheck bool - // SkipJSONConversion skips the YAML-to-JSON conversion during spec parsing. - // SpecJSON and SpecJSONBytes on SpecInfo will be nil when enabled. - // This also skips structural validation that parseJSON performs (e.g., duplicate key detection). + // SkipJSONConversion disables the JSON representation of the document entirely: + // SpecInfo.GetSpecJSON and GetSpecJSONBytes return nil when enabled (and the + // deprecated SpecJSON/SpecJSONBytes fields stay nil). This also skips the eager + // structural validation performed at parse time (e.g., duplicate key detection). // Safe when document-level schema validation rules are not running and no custom // functions depend on the JSON representation. + // + // Note: the JSON representation is built lazily on first accessor call, so leaving + // this disabled no longer costs anything at parse time. Enable it only to also + // skip the eager structural validation, or to guarantee accessors return nil. SkipJSONConversion bool // IgnorePolymorphicCircularReferences will skip over checking for circular references in polymorphic schemas. @@ -139,6 +144,20 @@ type DocumentConfiguration struct { // defaults to false (which means extensions will be included) ExcludeExtensionRefs bool + // SkipMetadataCollection disables the collection of diagnostic metadata during indexing: + // descriptions, summaries, enums, objects-with-properties, security requirement + // references, and the JSONPath `Path` values on inline schema references. Skipping + // them significantly reduces allocations and retained memory when parsing large + // documents. Reference extraction, resolution and model building are unaffected. + // + // -- UNSAFE FOR DIAGNOSTIC, RULE, OR PATH CONSUMERS -- + // When enabled, the index methods GetAllDescriptions, GetAllSummaries, GetAllEnums, + // GetAllObjectsWithProperties, GetSecurityRequirementReferences and the related + // counts are intentionally empty/zero, and inline schema Reference.Path values are + // empty strings. vacuum and any other tool that consumes index metadata or Path + // values must NOT enable this. Defaults to false (everything is collected). + SkipMetadataCollection bool + // BundleInlineRefs controls whether local component references are inlined during bundling. // When false (default): Local refs like #/components/schemas/Pet are preserved // When true: Local refs are also inlined (may break discriminator mappings) diff --git a/datamodel/high/base/schema.go b/datamodel/high/base/schema.go index f6577105..6a720808 100644 --- a/datamodel/high/base/schema.go +++ b/datamodel/high/base/schema.go @@ -367,50 +367,21 @@ func NewSchema(schema *base.Schema) *Schema { } s.Enum = enum - // async work. - // any polymorphic properties need to be handled in their own threads - // any properties each need to be processed in their own thread. - // we go as fast as we can. - polyCompletedChan := make(chan struct{}) - errChan := make(chan error) - - type buildResult struct { - idx int - s *SchemaProxy - } - - // for every item, build schema async - buildSchema := func(sch lowmodel.ValueReference[*base.SchemaProxy], idx int, bChan chan buildResult) { - n := &lowmodel.NodeReference[*base.SchemaProxy]{ - ValueNode: sch.ValueNode, - Value: sch.Value, - } - n.SetReference(sch.GetReference(), sch.GetReferenceNode()) - - p := NewSchemaProxy(n) - - bChan <- buildResult{idx: idx, s: p} - } - - // schema async - buildOutSchemas := func(schemas []lowmodel.ValueReference[*base.SchemaProxy], items *[]*SchemaProxy, - doneChan chan struct{}, e chan error, - ) { - bChan := make(chan buildResult) - totalSchemas := len(schemas) + // each item is a single SchemaProxy struct construction: spinning up goroutines + // and channels per item costs far more than the work itself, so build inline. + buildOutSchemas := func(schemas []lowmodel.ValueReference[*base.SchemaProxy]) []*SchemaProxy { + items := make([]*SchemaProxy, len(schemas)) for i := range schemas { - go buildSchema(schemas[i], i, bChan) - } - j := 0 - for j < totalSchemas { - r := <-bChan - j++ - (*items)[r.idx] = r.s + n := &lowmodel.NodeReference[*base.SchemaProxy]{ + ValueNode: schemas[i].ValueNode, + Value: schemas[i].Value, + } + n.SetReference(schemas[i].GetReference(), schemas[i].GetReferenceNode()) + items[i] = NewSchemaProxy(n) } - doneChan <- struct{}{} + return items } - // props async buildProps := func(k lowmodel.KeyReference[string], v lowmodel.ValueReference[*base.SchemaProxy], props *orderedmap.Map[string, *SchemaProxy], sw int, ) { @@ -470,21 +441,14 @@ func NewSchema(schema *base.Schema) *Schema { var items *DynamicValue[*SchemaProxy, bool] var prefixItems []*SchemaProxy - children := 0 if !schema.AllOf.IsEmpty() { - children++ - allOf = make([]*SchemaProxy, len(schema.AllOf.Value)) - go buildOutSchemas(schema.AllOf.Value, &allOf, polyCompletedChan, errChan) + allOf = buildOutSchemas(schema.AllOf.Value) } if !schema.AnyOf.IsEmpty() { - children++ - anyOf = make([]*SchemaProxy, len(schema.AnyOf.Value)) - go buildOutSchemas(schema.AnyOf.Value, &anyOf, polyCompletedChan, errChan) + anyOf = buildOutSchemas(schema.AnyOf.Value) } if !schema.OneOf.IsEmpty() { - children++ - oneOf = make([]*SchemaProxy, len(schema.OneOf.Value)) - go buildOutSchemas(schema.OneOf.Value, &oneOf, polyCompletedChan, errChan) + oneOf = buildOutSchemas(schema.OneOf.Value) } if !schema.Not.IsEmpty() { not = NewSchemaProxy(&schema.Not) @@ -504,21 +468,7 @@ func NewSchema(schema *base.Schema) *Schema { } } if !schema.PrefixItems.IsEmpty() { - children++ - prefixItems = make([]*SchemaProxy, len(schema.PrefixItems.Value)) - go buildOutSchemas(schema.PrefixItems.Value, &prefixItems, polyCompletedChan, errChan) - } - - completeChildren := 0 - if children > 0 { - allDone: - for { - <-polyCompletedChan - completeChildren++ - if children == completeChildren { - break allDone - } - } + prefixItems = buildOutSchemas(schema.PrefixItems.Value) } s.OneOf = oneOf s.AnyOf = anyOf diff --git a/datamodel/high/base/schema_proxy.go b/datamodel/high/base/schema_proxy.go index adc9afb5..53ecc1f5 100644 --- a/datamodel/high/base/schema_proxy.go +++ b/datamodel/high/base/schema_proxy.go @@ -175,12 +175,12 @@ type SchemaProxy struct { buildError error rendered *Schema refStr string - lock *sync.Mutex + lock sync.Mutex } // NewSchemaProxy creates a new high-level SchemaProxy from a low-level one. func NewSchemaProxy(schema *low.NodeReference[*base.SchemaProxy]) *SchemaProxy { - return &SchemaProxy{schema: schema, lock: &sync.Mutex{}} + return &SchemaProxy{schema: schema} } // copySchemaWithParentProxy creates a shallow copy of a schema and sets the ParentProxy @@ -193,13 +193,13 @@ func (sp *SchemaProxy) copySchemaWithParentProxy(schema *Schema) *Schema { // CreateSchemaProxy will create a new high-level SchemaProxy from a high-level Schema, this acts the same // as if the SchemaProxy is pre-rendered. func CreateSchemaProxy(schema *Schema) *SchemaProxy { - return &SchemaProxy{rendered: schema, lock: &sync.Mutex{}} + return &SchemaProxy{rendered: schema} } // CreateSchemaProxyRef will create a new high-level SchemaProxy from a reference string, this is used only when // building out new models from scratch that require a reference rather than a schema implementation. func CreateSchemaProxyRef(ref string) *SchemaProxy { - return &SchemaProxy{refStr: ref, lock: &sync.Mutex{}} + return &SchemaProxy{refStr: ref} } // CreateSchemaProxyRefWithSchema creates a SchemaProxy that carries both a $ref and sibling schema @@ -208,7 +208,7 @@ func CreateSchemaProxyRef(ref string) *SchemaProxy { // // If schema is nil, the result behaves identically to CreateSchemaProxyRef. func CreateSchemaProxyRefWithSchema(ref string, schema *Schema) *SchemaProxy { - return &SchemaProxy{refStr: ref, rendered: schema, lock: &sync.Mutex{}} + return &SchemaProxy{refStr: ref, rendered: schema} } // GetValueNode returns the value node of the SchemaProxy. @@ -228,7 +228,7 @@ func (sp *SchemaProxy) GetValueNode() *yaml.Node { // instead for proxies created using NewSchemaProxy or CreateSchema* methods. // https://github.com/pb33f/libopenapi/issues/403 func (sp *SchemaProxy) Schema() *Schema { - if sp == nil || sp.lock == nil { + if sp == nil { return nil } @@ -376,9 +376,6 @@ func (sp *SchemaProxy) BuildSchema() (*Schema, error) { return nil, nil } schema := sp.Schema() - if sp.lock == nil { - return schema, sp.buildError - } sp.lock.Lock() er := sp.buildError sp.lock.Unlock() @@ -390,9 +387,6 @@ func (sp *SchemaProxy) GetBuildError() error { if sp == nil { return nil } - if sp.lock == nil { - return sp.buildError - } sp.lock.Lock() err := sp.buildError sp.lock.Unlock() diff --git a/datamodel/high/base/schema_proxy_test.go b/datamodel/high/base/schema_proxy_test.go index 5a0a06fc..84ec3e6d 100644 --- a/datamodel/high/base/schema_proxy_test.go +++ b/datamodel/high/base/schema_proxy_test.go @@ -72,9 +72,12 @@ func TestCreateSchemaProxy_Fail(t *testing.T) { } func TestSchemaProxy_Schema_NoLowLevel(t *testing.T) { - proxy := &SchemaProxy{ - lock: &sync.Mutex{}, - } + proxy := &SchemaProxy{} + assert.Nil(t, proxy.Schema()) +} + +func TestSchemaProxy_Schema_NilProxy(t *testing.T) { + var proxy *SchemaProxy assert.Nil(t, proxy.Schema()) } @@ -868,7 +871,6 @@ func TestMarshalYAMLInline_CircularReferenceDetection_WithReference(t *testing.T ref := "#/components/schemas/CircularTest" proxy := &SchemaProxy{ refStr: ref, - lock: &sync.Mutex{}, } // Pre-load the render key to simulate being mid-render (cycle detected) @@ -959,7 +961,6 @@ func TestGetInlineRenderKey_ReferenceWithoutIndex(t *testing.T) { proxy := &SchemaProxy{ schema: &lowRef, - lock: &sync.Mutex{}, } renderKey := proxy.getInlineRenderKey() @@ -974,7 +975,6 @@ func TestGetInlineRenderKey_NilSchemaReturnsRefStr(t *testing.T) { proxy := &SchemaProxy{ refStr: "#/components/schemas/EarlyReturn", - lock: &sync.Mutex{}, } renderKey := proxy.getInlineRenderKey() @@ -994,7 +994,6 @@ func TestGetInlineRenderKey_NilSchemaValueReturnsRefStr(t *testing.T) { proxy := &SchemaProxy{ refStr: "#/components/schemas/AnotherEarlyReturn", schema: &lowRef, - lock: &sync.Mutex{}, } renderKey := proxy.getInlineRenderKey() @@ -1076,7 +1075,6 @@ func TestMarshalYAMLInlineWithContext_PreserveReference_ViaLowLevel(t *testing.T proxy := &SchemaProxy{ schema: &lowRef, - lock: &sync.Mutex{}, } // create context and mark the ref as preserved @@ -1137,7 +1135,6 @@ func TestMarshalYAMLInline_BundlingMode_ViaLowLevelRef(t *testing.T) { proxy := &SchemaProxy{ schema: &lowRef, - lock: &sync.Mutex{}, } // Enable bundling mode diff --git a/datamodel/low/extraction_functions.go b/datamodel/low/extraction_functions.go index f8f86d03..222aeec4 100644 --- a/datamodel/low/extraction_functions.go +++ b/datamodel/low/extraction_functions.go @@ -1100,13 +1100,17 @@ func ExtractExtensions(root *yaml.Node) *orderedmap.Map[KeyReference[string], Va if root == nil { return nil } - extensions := utils.FindExtensionNodes(root.Content) extensionMap := orderedmap.New[KeyReference[string], ValueReference[*yaml.Node]]() - for _, ext := range extensions { - extensionMap.Set(KeyReference[string]{ - Value: ext.Key.Value, - KeyNode: ext.Key, - }, ValueReference[*yaml.Node]{Value: ext.Value, ValueNode: ext.Value}) + content := root.Content + for i := 0; i+1 < len(content); i += 2 { + key := content[i] + if strings.HasPrefix(key.Value, "x-") { + value := utils.NodeAlias(content[i+1]) + extensionMap.Set(KeyReference[string]{ + Value: key.Value, + KeyNode: key, + }, ValueReference[*yaml.Node]{Value: value, ValueNode: value}) + } } return extensionMap } diff --git a/datamodel/low/model_builder.go b/datamodel/low/model_builder.go index 4cebe905..7b721b8f 100644 --- a/datamodel/low/model_builder.go +++ b/datamodel/low/model_builder.go @@ -9,6 +9,7 @@ import ( "strconv" "strings" "sync" + "unicode/utf8" "github.com/pb33f/libopenapi/orderedmap" "github.com/pb33f/libopenapi/utils" @@ -48,6 +49,19 @@ func buildModelFields(modelType reflect.Type) []buildModelField { return actual.([]buildModelField) } +// lowerIfNeeded returns the input unchanged when it contains no uppercase ASCII and no +// multibyte runes (the overwhelmingly common case for OpenAPI keys), avoiding the +// per-key allocation of strings.ToLower on the BuildModel hot path. +func lowerIfNeeded(s string) string { + for i := 0; i < len(s); i++ { + c := s[i] + if c >= utf8.RuneSelf || ('A' <= c && c <= 'Z') { + return strings.ToLower(s) + } + } + return s +} + // BuildModel accepts a yaml.Node pointer and a model, which can be any struct. Using reflection, the model is // analyzed and the names of all the properties are extracted from the model and subsequently looked up from within // the yaml.Node.Content value. @@ -70,7 +84,7 @@ func BuildModel(node *yaml.Node, model interface{}) error { content := node.Content keyMap := make(map[string]int, len(content)/2) for j := 0; j < len(content)-1; j += 2 { - k := strings.ToLower(utils.NodeAlias(content[j]).Value) + k := lowerIfNeeded(utils.NodeAlias(content[j]).Value) if _, exists := keyMap[k]; !exists { keyMap[k] = j } diff --git a/datamodel/low/v2/swagger.go b/datamodel/low/v2/swagger.go index 0d5f668e..7e5211b0 100644 --- a/datamodel/low/v2/swagger.go +++ b/datamodel/low/v2/swagger.go @@ -152,6 +152,7 @@ func createDocument(info *datamodel.SpecInfo, config *datamodel.DocumentConfigur idxConfig.BasePath = config.BasePath idxConfig.Logger = config.Logger idxConfig.ExcludeExtensionRefs = config.ExcludeExtensionRefs + idxConfig.SkipMetadataCollection = config.SkipMetadataCollection rolodex := index.NewRolodex(idxConfig) rolodex.SetRootNode(info.RootNode) doc.Rolodex = rolodex diff --git a/datamodel/low/v2/swagger_test.go b/datamodel/low/v2/swagger_test.go index 1ef93ed1..03e09f84 100644 --- a/datamodel/low/v2/swagger_test.go +++ b/datamodel/low/v2/swagger_test.go @@ -69,6 +69,24 @@ func TestCreateDocument(t *testing.T) { assert.Equal(t, 1, orderedmap.Len(doc.GetExtensions())) } +func TestCreateDocument_SkipMetadataCollection_Propagates(t *testing.T) { + data, _ := os.ReadFile("../../../test_specs/petstorev2-complete.yaml") + + info, _ := datamodel.ExtractSpecInfo(data) + cfg := datamodel.NewDocumentConfiguration() + cfg.SkipMetadataCollection = true + skipDoc, err := CreateDocumentFromConfig(info, cfg) + assert.NoError(t, err) + assert.True(t, skipDoc.Index.GetConfig().SkipMetadataCollection) + assert.Empty(t, skipDoc.Index.GetAllDescriptions()) + + info, _ = datamodel.ExtractSpecInfo(data) + fullDoc, err := CreateDocumentFromConfig(info, datamodel.NewDocumentConfiguration()) + assert.NoError(t, err) + assert.False(t, fullDoc.Index.GetConfig().SkipMetadataCollection) + assert.NotEmpty(t, fullDoc.Index.GetAllDescriptions()) +} + func TestCreateDocument_Info(t *testing.T) { initTest() assert.Equal(t, "Swagger Petstore", doc.Info.Value.Title.Value) diff --git a/datamodel/low/v3/create_document.go b/datamodel/low/v3/create_document.go index 7ae8f838..74a2a544 100644 --- a/datamodel/low/v3/create_document.go +++ b/datamodel/low/v3/create_document.go @@ -151,6 +151,7 @@ func createDocument(info *datamodel.SpecInfo, config *datamodel.DocumentConfigur idxConfig.SpecInfo = info idxConfig.UseSchemaQuickHash = config.UseSchemaQuickHash idxConfig.ExcludeExtensionRefs = config.ExcludeExtensionRefs + idxConfig.SkipMetadataCollection = config.SkipMetadataCollection idxConfig.IgnoreArrayCircularReferences = config.IgnoreArrayCircularReferences idxConfig.IgnorePolymorphicCircularReferences = config.IgnorePolymorphicCircularReferences idxConfig.AllowUnknownExtensionContentDetection = config.AllowUnknownExtensionContentDetection diff --git a/datamodel/low/v3/create_document_test.go b/datamodel/low/v3/create_document_test.go index 9eec996d..0c9b3423 100644 --- a/datamodel/low/v3/create_document_test.go +++ b/datamodel/low/v3/create_document_test.go @@ -386,9 +386,11 @@ func BenchmarkCreateDocument_Stripe(b *testing.B) { info, _ := datamodel.ExtractSpecInfo(data) for i := 0; i < b.N; i++ { - _, err := CreateDocumentFromConfig(info, &datamodel.DocumentConfiguration{}) - if err != nil { - panic("this should not error") + // stripe.yaml contains circular references, so an error is expected + // with the default configuration; only a nil document is a failure. + doc, _ := CreateDocumentFromConfig(info, &datamodel.DocumentConfiguration{}) + if doc == nil { + panic("document should not be nil") } } } @@ -424,6 +426,31 @@ func TestCreateDocument(t *testing.T) { assert.Equal(t, 1, orderedmap.Len(doc.GetExtensions())) } +func TestCreateDocument_SkipMetadataCollection_Propagates(t *testing.T) { + spec := []byte(`openapi: 3.1.0 +info: + title: skip metadata + description: a description that would normally be collected + version: 1.0.0 +paths: {}`) + + info, err := datamodel.ExtractSpecInfo(spec) + assert.NoError(t, err) + + skipDoc, err := CreateDocumentFromConfig(info, &datamodel.DocumentConfiguration{ + SkipMetadataCollection: true, + }) + assert.NoError(t, err) + assert.True(t, skipDoc.Index.GetConfig().SkipMetadataCollection) + assert.Empty(t, skipDoc.Index.GetAllDescriptions()) + + info, _ = datamodel.ExtractSpecInfo(spec) + fullDoc, err := CreateDocumentFromConfig(info, &datamodel.DocumentConfiguration{}) + assert.NoError(t, err) + assert.False(t, fullDoc.Index.GetConfig().SkipMetadataCollection) + assert.NotEmpty(t, fullDoc.Index.GetAllDescriptions()) +} + func TestCreateDocument_DeprecatedWrapper(t *testing.T) { spec := []byte(`openapi: 3.1.0 info: diff --git a/datamodel/spec_info.go b/datamodel/spec_info.go index 8a2e7e0b..bfb51f3a 100644 --- a/datamodel/spec_info.go +++ b/datamodel/spec_info.go @@ -9,6 +9,7 @@ import ( "errors" "fmt" "strings" + "sync" "time" "unicode/utf16" "unicode/utf8" @@ -31,18 +32,99 @@ type SpecInfo struct { VersionNumeric float32 `json:"versionNumeric"` SpecFormat string `json:"format"` SpecFileType string `json:"fileType"` - SpecBytes *[]byte `json:"bytes"` // the original byte array - RootNode *yaml.Node `json:"-"` // reference to the root node of the spec. - SpecJSONBytes *[]byte `json:"-"` // original bytes converted to JSON - SpecJSON *map[string]interface{} `json:"-"` // standard JSON map of original bytes - Error error `json:"-"` // something go wrong? - APISchema string `json:"-"` // API Schema for supplied spec type (2 or 3) - Generated time.Time `json:"-"` - OriginalIndentation int `json:"-"` // the original whitespace - Self string `json:"-"` // the $self field for OpenAPI 3.2+ documents (base URI) + SpecBytes *[]byte `json:"bytes"` // the original byte array + RootNode *yaml.Node `json:"-"` // reference to the root node of the spec. + + // SpecJSONBytes is the original document converted to JSON. It is populated lazily. + // + // Deprecated: read via GetSpecJSONBytes(), which builds the JSON representation on + // first use. This field is nil until GetSpecJSON or GetSpecJSONBytes is called. + // Concurrent readers must use the accessors: a direct field read racing the first + // accessor call is a data race. + SpecJSONBytes *[]byte `json:"-"` + + // SpecJSON is the original document as a standard JSON map. It is populated lazily. + // + // Deprecated: read via GetSpecJSON(), which builds the JSON representation on + // first use. This field is nil until GetSpecJSON or GetSpecJSONBytes is called. + // Concurrent readers must use the accessors: a direct field read racing the first + // accessor call is a data race. + SpecJSON *map[string]interface{} `json:"-"` + + Error error `json:"-"` // something go wrong? + APISchema string `json:"-"` // API Schema for supplied spec type (2 or 3) + Generated time.Time `json:"-"` + OriginalIndentation int `json:"-"` // the original whitespace + Self string `json:"-"` // the $self field for OpenAPI 3.2+ documents (base URI) + + jsonOnce sync.Once // guards the lazy JSON build + jsonErr error // error from the lazy JSON build, if any + skipJSONBuild bool // set by SkipJSONConversion: accessors return nil, preserving the flag's contract +} + +// GetSpecJSON returns the document as a standard JSON map, building it on first call. +// Returns nil if the document cannot be converted, or if the SpecInfo has been +// released (the node tree and original bytes are required to build the JSON view). +func (s *SpecInfo) GetSpecJSON() *map[string]interface{} { + s.jsonOnce.Do(s.buildJSON) + return s.SpecJSON +} + +// GetSpecJSONBytes returns the document converted to JSON bytes, building the +// representation on first call. Returns nil if the document cannot be converted, or +// if the SpecInfo has been released. +func (s *SpecInfo) GetSpecJSONBytes() *[]byte { + s.jsonOnce.Do(s.buildJSON) + return s.SpecJSONBytes +} + +// GetSpecJSONError returns the error from building the JSON view, building it first +// if needed. It distinguishes a failed conversion (non-nil error) from a released +// SpecInfo (nil error, nil JSON: there is nothing left to build, so nothing failed). +func (s *SpecInfo) GetSpecJSONError() error { + s.jsonOnce.Do(s.buildJSON) + return s.jsonErr +} + +// buildJSON populates SpecJSON and SpecJSONBytes from the parsed node tree (YAML) or +// the original bytes (JSON). This is the same conversion the library used to perform +// eagerly on every parse; it now runs only when the JSON view is requested. +func (s *SpecInfo) buildJSON() { + if s.skipJSONBuild { + return + } + var jsonSpec map[string]interface{} + if s.SpecFileType == YAMLFileType { + if s.RootNode == nil { + return + } + if err := s.RootNode.Decode(&jsonSpec); err != nil { + s.jsonErr = fmt.Errorf("failed to decode YAML to JSON: %w", err) + return + } + // Marshal to JSON - if this fails due to unsupported types (e.g. map[interface{}]interface{}), + // we tolerate it as it doesn't indicate spec invalidity, just YAML/JSON incompatibility + b, err := json.Marshal(&jsonSpec) + if err == nil { + s.SpecJSONBytes = &b + } + s.SpecJSON = &jsonSpec + return + } + if s.SpecBytes == nil { + return + } + if err := json.Unmarshal(*s.SpecBytes, &jsonSpec); err != nil { + s.jsonErr = fmt.Errorf("failed to unmarshal JSON: %w", err) + return + } + s.SpecJSONBytes = s.SpecBytes + s.SpecJSON = &jsonSpec } // Release nils fields that pin the YAML node tree and large byte arrays in memory. +// After release, GetSpecJSON and GetSpecJSONBytes return nil: the inputs needed to +// build the JSON view are gone and will not be resurrected. func (s *SpecInfo) Release() { if s == nil { return @@ -81,7 +163,7 @@ func ExtractSpecInfoWithDocumentCheck(spec []byte, bypass bool) (*SpecInfo, erro func extractSpecInfoInternal(spec []byte, bypass bool, skipJSON bool) (*SpecInfo, error) { var parsedSpec yaml.Node - specInfo := &SpecInfo{} + specInfo := &SpecInfo{skipJSONBuild: skipJSON} // set original bytes specInfo.SpecBytes = &spec @@ -132,28 +214,38 @@ func extractSpecInfoInternal(spec []byte, bypass bool, skipJSON bool) (*SpecInfo _, openAPI2 := utils.FindKeyNode(utils.OpenApi2, parsedSpec.Content) _, asyncAPI := utils.FindKeyNode(utils.AsyncApi, parsedSpec.Content) + // validate document structure without building the JSON view: the root must be a + // mapping, YAML mappings must not contain duplicate keys (matching the yaml + // decoder's checks), and JSON input must be syntactically valid. The full JSON + // conversion is built lazily by GetSpecJSON/GetSpecJSONBytes. parseJSON := func(bytes []byte, spec *SpecInfo, parsedNode *yaml.Node) error { - var jsonSpec map[string]interface{} if spec.SpecFileType == YAMLFileType { - // Decode YAML to map - this is critical to catch structural errors like duplicate keys - if err := parsedNode.Decode(&jsonSpec); err != nil { - return fmt.Errorf("failed to decode YAML to JSON: %w", err) + root := parsedNode + if root.Kind == yaml.DocumentNode && len(root.Content) > 0 { + root = root.Content[0] } - // Marshal to JSON - if this fails due to unsupported types (e.g. map[interface{}]interface{}), - // we tolerate it as it doesn't indicate spec invalidity, just YAML/JSON incompatibility - b, err := json.Marshal(&jsonSpec) - if err == nil { - spec.SpecJSONBytes = &b + if root.Kind != yaml.MappingNode && root.Tag != "!!null" { + // the document cannot construct into a map: run the decoder purely to + // surface its exact construct error (garbage input is the rare path). + var jsonSpec map[string]interface{} + if err := parsedNode.Decode(&jsonSpec); err != nil { + return fmt.Errorf("failed to decode YAML to JSON: %w", err) + } } - spec.SpecJSON = &jsonSpec - } else { - if err := json.Unmarshal(bytes, &jsonSpec); err != nil { - return fmt.Errorf("failed to unmarshal JSON: %w", err) + if err := checkDuplicateMappingKeys(parsedNode); err != nil { + return fmt.Errorf("failed to decode YAML to JSON: %w", err) } - spec.SpecJSONBytes = &bytes - spec.SpecJSON = &jsonSpec + return nil } - return nil + if json.Valid(bytes) { + return nil + } + // invalid JSON is the rare path: run the decoder purely to surface the same + // detailed error message the eager conversion produced. json.Valid and + // json.Unmarshal share the same scanner, so the decode always errors here. + var jsonSpec map[string]interface{} + err := json.Unmarshal(bytes, &jsonSpec) + return fmt.Errorf("failed to unmarshal JSON: %w", err) } // if !bypass { @@ -316,6 +408,64 @@ func ExtractSpecInfo(spec []byte) (*SpecInfo, error) { return ExtractSpecInfoWithDocumentCheck(spec, false) } +// checkDuplicateMappingKeys walks a parsed node tree and reports duplicate mapping +// keys using the exact equality semantics of the go.yaml.in/yaml/v4 decoder: two keys +// in the same mapping collide when their node Kind and raw Value match (no tag or +// alias resolution). Error text matches the decoder's construct errors byte for byte, +// and children of an offending mapping are not descended into, mirroring the decoder +// halting construction of that mapping. +// +// Known divergence: an anchored mapping with duplicate keys that is aliased +// elsewhere is reported ONCE here, while the decoder re-reports it on every +// construction visit (anchor + each alias). The decoder's repeat count is an +// artifact of construction order, not extra information, so the walker does +// not replicate it. TestCheckDuplicateMappingKeys_MatchesDecoder pins parity +// for everything else (and TestCheckDuplicateMappingKeys_AliasedAnchorDivergence +// pins this exception); revisit both when go.yaml.in/yaml/v4 leaves rc. +func checkDuplicateMappingKeys(node *yaml.Node) error { + var errs []string + walkDuplicateMappingKeys(node, &errs) + if len(errs) == 0 { + return nil + } + var b strings.Builder + b.WriteString("yaml: construct errors:") + for _, e := range errs { + b.WriteString("\n ") + b.WriteString(e) + } + return errors.New(b.String()) +} + +func walkDuplicateMappingKeys(node *yaml.Node, errs *[]string) { + switch node.Kind { + case yaml.DocumentNode, yaml.SequenceNode: + for _, child := range node.Content { + walkDuplicateMappingKeys(child, errs) + } + case yaml.MappingNode: + l := len(node.Content) + found := false + for i := 0; i < l; i += 2 { + ni := node.Content[i] + for j := i + 2; j < l; j += 2 { + nj := node.Content[j] + if ni.Kind == nj.Kind && ni.Value == nj.Value { + *errs = append(*errs, + fmt.Sprintf("line %d: mapping key %#v already defined at line %d", nj.Line, nj.Value, ni.Line)) + found = true + } + } + } + if found { + return + } + for _, child := range node.Content { + walkDuplicateMappingKeys(child, errs) + } + } +} + // extract version number from specification func parseVersionTypeData(d interface{}) (string, int, error) { r := []rune(strings.TrimSpace(fmt.Sprintf("%v", d))) diff --git a/datamodel/spec_info_duplicate_keys_test.go b/datamodel/spec_info_duplicate_keys_test.go new file mode 100644 index 00000000..12535b2e --- /dev/null +++ b/datamodel/spec_info_duplicate_keys_test.go @@ -0,0 +1,106 @@ +// Copyright 2026 Princess B33f Heavy Industries / Dave Shanley +// SPDX-License-Identifier: MIT + +package datamodel + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.yaml.in/yaml/v4" +) + +// TestCheckDuplicateMappingKeys_MatchesDecoder is a differential test: every case is +// run through both checkDuplicateMappingKeys and the yaml v4 decoder (the previous +// source of duplicate-key errors). The walker must agree with the decoder on whether +// an error occurs AND on the exact construct error text. +func TestCheckDuplicateMappingKeys_MatchesDecoder(t *testing.T) { + cases := []struct { + name string + yml string + }{ + {"no duplicates", "a: 1\nb: 2\nc:\n d: 3\n e: 4\n"}, + {"simple duplicate", "a: 1\nb: 2\na: 3\n"}, + {"nested duplicate", "root:\n x: 1\n y: 2\n x: 3\n"}, + {"duplicate inside sequence", "items:\n - k: 1\n k: 2\n - ok: 1\n"}, + {"multiple duplicates", "a: 1\na: 2\nb: 3\nb: 4\n"}, + {"triple duplicate", "a: 1\na: 2\na: 3\n"}, + {"alias keys", "anchored: &k value\nmap:\n *k : 1\n *k : 2\n"}, + {"alias key vs literal", "anchored: &k value\nmap:\n *k : 1\n value: 2\n"}, + {"merge keys", "base: &base\n a: 1\nuses:\n <<: *base\n a: 2\n"}, + {"duplicate merge keys", "b1: &b1\n a: 1\nb2: &b2\n b: 2\nuses:\n <<: *b1\n <<: *b2\n"}, + {"tagged int vs plain", "m:\n !!str 1: a\n 1: b\n"}, + {"quoted vs plain same value", "m:\n \"true\": a\n true: b\n"}, + {"flow map key", "m:\n {a: 1}: x\n {a: 1}: y\n"}, + {"duplicate under duplicate", "a:\n inner: 1\n inner: 2\na:\n other: 1\n"}, + {"numeric keys", "m:\n 9: a\n 9: b\n"}, + {"empty mapping", "{}\n"}, + {"deeply nested clean", "a:\n b:\n c:\n - d: 1\n e: 2\n"}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + var node yaml.Node + require.NoError(t, yaml.Unmarshal([]byte(tc.yml), &node), "corpus cases must parse") + + // the decoder's view (previous behavior). + var decoded map[string]interface{} + decodeErr := node.Decode(&decoded) + + // the walker's view (new behavior). + walkErr := checkDuplicateMappingKeys(&node) + + if decodeErr != nil { + require.Error(t, walkErr, "decoder errored but walker did not: %s", decodeErr) + assert.Equal(t, decodeErr.Error(), walkErr.Error(), "error text must match decoder byte for byte") + } else { + assert.NoError(t, walkErr, "walker errored but decoder did not") + } + }) + } +} + +// TestCheckDuplicateMappingKeys_AliasedAnchorDivergence pins the one KNOWN, +// INTENTIONAL divergence from the decoder: an anchored mapping with duplicate +// keys that is aliased elsewhere is reported once per definition by the +// walker, but once per construction visit (anchor + each alias) by the +// decoder. The duplicate itself is identical; only the repeat count differs. +// If this test starts failing on the walker side, the decoder's construction +// semantics changed - re-verify the whole corpus above. +func TestCheckDuplicateMappingKeys_AliasedAnchorDivergence(t *testing.T) { + yml := "a: &x {k: 1, k: 2}\nb: *x\nc: *x\n" + var node yaml.Node + require.NoError(t, yaml.Unmarshal([]byte(yml), &node)) + + var decoded map[string]interface{} + decodeErr := node.Decode(&decoded) + walkErr := checkDuplicateMappingKeys(&node) + + require.Error(t, decodeErr) + require.Error(t, walkErr) + + dupLine := `line 1: mapping key "k" already defined at line 1` + // decoder: one report per construction visit (anchor + two aliases). + assert.Equal(t, 3, strings.Count(decodeErr.Error(), dupLine), "decoder reports per visit") + // walker: one report per definition. + assert.Equal(t, 1, strings.Count(walkErr.Error(), dupLine), "walker reports once") +} + +// TestCheckDuplicateMappingKeys_OffendingMappingNotDescended pins the decoder-matching +// behavior that children of a mapping with duplicate keys are not walked: the decoder +// stops constructing that mapping, so nested duplicates below it never surface. +func TestCheckDuplicateMappingKeys_OffendingMappingNotDescended(t *testing.T) { + yml := "a: 1\na:\n x: 1\n x: 2\n" + var node yaml.Node + require.NoError(t, yaml.Unmarshal([]byte(yml), &node)) + + var decoded map[string]interface{} + decodeErr := node.Decode(&decoded) + walkErr := checkDuplicateMappingKeys(&node) + + require.Error(t, decodeErr) + require.Error(t, walkErr) + assert.Equal(t, decodeErr.Error(), walkErr.Error()) +} diff --git a/datamodel/spec_info_lazy_json_test.go b/datamodel/spec_info_lazy_json_test.go new file mode 100644 index 00000000..9973a952 --- /dev/null +++ b/datamodel/spec_info_lazy_json_test.go @@ -0,0 +1,134 @@ +// Copyright 2026 Princess B33f Heavy Industries / Dave Shanley +// SPDX-License-Identifier: MIT + +package datamodel + +import ( + "sync" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +const lazyJSONYAML = `openapi: 3.0.1 +info: + title: lazy + version: 1.0.0 +paths: {}` + +const lazyJSONJSON = `{"openapi":"3.0.1","info":{"title":"lazy","version":"1.0.0"},"paths":{}}` + +func TestSpecInfo_LazyJSON_YAMLInput(t *testing.T) { + r, err := ExtractSpecInfo([]byte(lazyJSONYAML)) + require.NoError(t, err) + + // nothing is built eagerly. + assert.Nil(t, r.SpecJSON) + assert.Nil(t, r.SpecJSONBytes) + + j := r.GetSpecJSON() + require.NotNil(t, j) + assert.Equal(t, "3.0.1", (*j)["openapi"]) + assert.NoError(t, r.GetSpecJSONError()) + + b := r.GetSpecJSONBytes() + require.NotNil(t, b) + assert.Greater(t, len(*b), 0) + + // accessors populate the public fields for mixed readers. + assert.Equal(t, j, r.SpecJSON) + assert.Equal(t, b, r.SpecJSONBytes) +} + +func TestSpecInfo_LazyJSON_JSONInput(t *testing.T) { + r, err := ExtractSpecInfo([]byte(lazyJSONJSON)) + require.NoError(t, err) + + assert.Nil(t, r.SpecJSON) + + j := r.GetSpecJSON() + require.NotNil(t, j) + assert.Equal(t, "3.0.1", (*j)["openapi"]) + + // JSON input: the bytes are the original document, not a copy. + b := r.GetSpecJSONBytes() + require.NotNil(t, b) + assert.Equal(t, r.SpecBytes, b) +} + +func TestSpecInfo_LazyJSON_Concurrent(t *testing.T) { + r, err := ExtractSpecInfo([]byte(lazyJSONYAML)) + require.NoError(t, err) + + var wg sync.WaitGroup + for i := 0; i < 16; i++ { + wg.Add(1) + go func() { + defer wg.Done() + assert.NotNil(t, r.GetSpecJSON()) + assert.NotNil(t, r.GetSpecJSONBytes()) + }() + } + wg.Wait() +} + +func TestSpecInfo_LazyJSON_SkipJSONConversion(t *testing.T) { + r, err := ExtractSpecInfoWithConfig([]byte(lazyJSONYAML), &DocumentConfiguration{ + SkipJSONConversion: true, + }) + require.NoError(t, err) + // the flag's contract holds for accessors too: no JSON representation, ever. + // consumers (e.g. vacuum turbo mode) rely on nil as the "conversion disabled" signal. + assert.Nil(t, r.SpecJSON) + assert.Nil(t, r.GetSpecJSON()) + assert.Nil(t, r.GetSpecJSONBytes()) + assert.NoError(t, r.GetSpecJSONError()) +} + +func TestSpecInfo_LazyJSON_AfterRelease(t *testing.T) { + r, err := ExtractSpecInfo([]byte(lazyJSONYAML)) + require.NoError(t, err) + + r.Release() + + // released: inputs are gone, accessors return nil without panicking and + // nothing is resurrected. no error either - nothing was built, nothing failed. + assert.Nil(t, r.GetSpecJSON()) + assert.Nil(t, r.GetSpecJSONBytes()) + assert.NoError(t, r.GetSpecJSONError()) + assert.Nil(t, r.SpecJSON) + assert.Nil(t, r.SpecJSONBytes) + + // same nil-safety for JSON input, which builds from the original bytes. + rj, err := ExtractSpecInfo([]byte(lazyJSONJSON)) + require.NoError(t, err) + rj.Release() + assert.Nil(t, rj.GetSpecJSON()) + assert.Nil(t, rj.GetSpecJSONBytes()) +} + +func TestSpecInfo_LazyJSON_YAMLDecodeError(t *testing.T) { + // a tagged scalar that cannot decode to its tag passes extraction (only duplicate + // keys are validated eagerly) but fails the lazy build. + spec := "openapi: 3.0.1\ninfo:\n title: lazy\n version: !!int notanint\npaths: {}" + r, err := ExtractSpecInfo([]byte(spec)) + require.NoError(t, err) + + assert.Nil(t, r.GetSpecJSON()) + assert.Nil(t, r.GetSpecJSONBytes()) + assert.ErrorContains(t, r.GetSpecJSONError(), "failed to decode YAML to JSON") +} + +func TestSpecInfo_LazyJSON_JSONUnmarshalError(t *testing.T) { + // bypass lets structurally invalid JSON through extraction; the lazy build + // surfaces the decode failure instead. + bad := `{"openapi": }` + r, err := ExtractSpecInfoWithDocumentCheck([]byte(bad), true) + require.NoError(t, err) + r.SpecFileType = JSONFileType + + // GetSpecJSONError builds on first call, so it works standalone too. + assert.ErrorContains(t, r.GetSpecJSONError(), "failed to unmarshal JSON") + assert.Nil(t, r.GetSpecJSON()) +} diff --git a/datamodel/spec_info_test.go b/datamodel/spec_info_test.go index a2ba2d36..56ec42a5 100644 --- a/datamodel/spec_info_test.go +++ b/datamodel/spec_info_test.go @@ -203,7 +203,7 @@ info: func TestExtractSpecInfo_ValidJSON(t *testing.T) { r, e := ExtractSpecInfo([]byte(goodJSON)) - assert.Greater(t, len(*r.SpecJSONBytes), 0) + assert.Greater(t, len(*r.GetSpecJSONBytes()), 0) assert.Error(t, e) } @@ -219,7 +219,7 @@ func TestExtractSpecInfo_Nothing(t *testing.T) { func TestExtractSpecInfo_ValidYAML(t *testing.T) { r, e := ExtractSpecInfo([]byte(goodYAML)) - assert.Greater(t, len(*r.SpecJSONBytes), 0) + assert.Greater(t, len(*r.GetSpecJSONBytes()), 0) assert.Error(t, e) } @@ -254,7 +254,7 @@ func TestExtractSpecInfo_OpenAPI3(t *testing.T) { assert.Nil(t, e) assert.Equal(t, utils.OpenApi3, r.SpecType) assert.Equal(t, "3.0.1", r.Version) - assert.Greater(t, len(*r.SpecJSONBytes), 0) + assert.Greater(t, len(*r.GetSpecJSONBytes()), 0) assert.Contains(t, r.APISchema, "https://spec.openapis.org/oas/3.0/schema/2021-09-28") } @@ -349,7 +349,7 @@ func TestExtractSpecInfo_OpenAPI2(t *testing.T) { assert.Nil(t, e) assert.Equal(t, OpenApi2, r.SpecType) assert.Equal(t, "2.0.1", r.Version) - assert.Greater(t, len(*r.SpecJSONBytes), 0) + assert.Greater(t, len(*r.GetSpecJSONBytes()), 0) assert.Contains(t, r.APISchema, "http://swagger.io/v2/schema.json#") } @@ -365,7 +365,7 @@ func TestExtractSpecInfo_AsyncAPI(t *testing.T) { assert.Nil(t, e) assert.Equal(t, AsyncApi, r.SpecType) assert.Equal(t, "2.0.0", r.Version) - assert.Greater(t, len(*r.SpecJSONBytes), 0) + assert.Greater(t, len(*r.GetSpecJSONBytes()), 0) } func TestExtractSpecInfo_AsyncAPI_OddVersion(t *testing.T) { diff --git a/datamodel/translate.go b/datamodel/translate.go index a5245c0c..7e8ce644 100644 --- a/datamodel/translate.go +++ b/datamodel/translate.go @@ -38,6 +38,11 @@ type pipelineResult[OUT any] struct { err error } +// parallelTranslateThreshold is the collection size below which TranslateSliceParallel +// and TranslateMapParallel run sequentially: worker pool setup (goroutines, channels, +// pending map) costs more than translating a handful of items inline. +const parallelTranslateThreshold = 16 + // TranslateSliceParallel iterates a slice in parallel and calls translate() // asynchronously. // translate() may return `datamodel.Continue` to continue iteration. @@ -48,6 +53,33 @@ func TranslateSliceParallel[IN any, OUT any](in []IN, translate TranslateSliceFu return nil } + // small collections run inline: same observable semantics, none of the + // worker pool overhead. + if len(in) <= parallelTranslateThreshold { + for i := range in { + out, err := translate(i, in[i]) + if err == Continue { + continue + } + if err != nil { + if err == io.EOF { + return nil + } + return err + } + if result == nil { + continue + } + if err = result(out); err != nil { + if err == io.EOF { + return nil + } + return err + } + } + return nil + } + ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -149,14 +181,33 @@ func TranslateMapParallel[K comparable, V any, RV any](m *orderedmap.Map[K, V], return nil } - // Snapshot pairs for indexed access. + // small maps run inline: same observable semantics, none of the worker + // pool or pair snapshot overhead. + if m.Len() <= parallelTranslateThreshold { + for pair := orderedmap.First(m); pair != nil; pair = pair.Next() { + rv, err := translate(pair) + if err != nil { + if err == io.EOF { + return nil + } + return err + } + if err = result(rv); err != nil { + if err == io.EOF { + return nil + } + return err + } + } + return nil + } + + // Snapshot pairs for indexed access. The map is larger than the sequential + // threshold, so pairs is never empty here. pairs := make([]orderedmap.Pair[K, V], 0, m.Len()) for pair := orderedmap.First(m); pair != nil; pair = pair.Next() { pairs = append(pairs, pair) } - if len(pairs) == 0 { - return nil - } ctx, cancel := context.WithCancel(context.Background()) defer cancel() diff --git a/datamodel/translate_test.go b/datamodel/translate_test.go index 00aa9147..0ee4321d 100644 --- a/datamodel/translate_test.go +++ b/datamodel/translate_test.go @@ -176,6 +176,97 @@ func TestTranslateSliceParallel(t *testing.T) { } } +// TestTranslateSliceParallel_SequentialNilResult covers the inline fast path when no +// result callback is provided (collections at or below the parallel threshold). +func TestTranslateSliceParallel_SequentialNilResult(t *testing.T) { + sl := []int{1, 2, 3} + var translateCounter int64 + translateFunc := func(_, value int) (string, error) { + atomic.AddInt64(&translateCounter, 1) + return "foobar", nil + } + err := datamodel.TranslateSliceParallel[int, string](sl, translateFunc, nil) + require.NoError(t, err) + assert.Equal(t, int64(3), translateCounter) +} + +// TestTranslateMapParallel_Sequential covers every branch of the inline fast path +// taken for maps at or below the parallel threshold. +func TestTranslateMapParallel_Sequential(t *testing.T) { + buildMap := func(n int) *orderedmap.Map[string, int] { + m := orderedmap.New[string, int]() + for i := 0; i < n; i++ { + m.Set(fmt.Sprintf("key%d", i), i) + } + return m + } + + t.Run("Happy path preserves order", func(t *testing.T) { + m := buildMap(5) + var results []string + err := datamodel.TranslateMapParallel[string, int, string](m, + func(pair orderedmap.Pair[string, int]) (string, error) { + return fmt.Sprintf("foobar %d", pair.Value()), nil + }, + func(value string) error { + results = append(results, value) + return nil + }) + require.NoError(t, err) + assert.Equal(t, []string{"foobar 0", "foobar 1", "foobar 2", "foobar 3", "foobar 4"}, results) + }) + + t.Run("Error in translate", func(t *testing.T) { + m := buildMap(3) + err := datamodel.TranslateMapParallel[string, int, string](m, + func(orderedmap.Pair[string, int]) (string, error) { + return "", errors.New("Foobar") + }, + func(string) error { return nil }) + require.ErrorContains(t, err, "Foobar") + }) + + t.Run("EOF in translate", func(t *testing.T) { + m := buildMap(3) + var resultCounter int + err := datamodel.TranslateMapParallel[string, int, string](m, + func(orderedmap.Pair[string, int]) (string, error) { + return "", io.EOF + }, + func(string) error { + resultCounter++ + return nil + }) + require.NoError(t, err) + assert.Zero(t, resultCounter) + }) + + t.Run("Error in result", func(t *testing.T) { + m := buildMap(3) + err := datamodel.TranslateMapParallel[string, int, string](m, + func(pair orderedmap.Pair[string, int]) (string, error) { + return "foobar", nil + }, + func(string) error { return errors.New("Foobar") }) + require.ErrorContains(t, err, "Foobar") + }) + + t.Run("EOF in result", func(t *testing.T) { + m := buildMap(3) + var resultCounter int + err := datamodel.TranslateMapParallel[string, int, string](m, + func(pair orderedmap.Pair[string, int]) (string, error) { + return "foobar", nil + }, + func(string) error { + resultCounter++ + return io.EOF + }) + require.NoError(t, err) + assert.Equal(t, 1, resultCounter) + }) +} + func TestTranslateMapParallel(t *testing.T) { const mapSize = 1000 diff --git a/index/extract_refs_inline.go b/index/extract_refs_inline.go index 3f80fe03..643c8373 100644 --- a/index/extract_refs_inline.go +++ b/index/extract_refs_inline.go @@ -11,6 +11,39 @@ import ( "go.yaml.in/yaml/v4" ) +// buildDefinitionPath assembles absPath + "#/" + seenPath + extra segments joined by +// '/' in a single allocation. The "#/..." definition is a zero-copy suffix slice of +// the result, so callers derive both strings from one build. +func buildDefinitionPath(absPath string, seenPath []string, extra ...string) string { + size := len(absPath) + 2 + for _, s := range seenPath { + size += len(s) + 1 + } + for _, s := range extra { + size += len(s) + 1 + } + var b strings.Builder + b.Grow(size) + b.WriteString(absPath) + b.WriteString("#/") + first := true + for _, s := range seenPath { + if !first { + b.WriteByte('/') + } + b.WriteString(s) + first = false + } + for _, s := range extra { + if !first { + b.WriteByte('/') + } + b.WriteString(s) + first = false + } + return b.String() +} + func (index *SpecIndex) collectInlineSchemaDefinition(parent, node *yaml.Node, seenPath []string, keyIndex int) { if keyIndex+1 >= len(node.Content) { return @@ -21,11 +54,11 @@ func (index *SpecIndex) collectInlineSchemaDefinition(parent, node *yaml.Node, s var jsonPath, definitionPath, fullDefinitionPath string if len(seenPath) > 0 || keyNode.Value != "" { - loc := append(seenPath, keyNode.Value) - locPath := strings.Join(loc, "/") - definitionPath = "#/" + locPath - fullDefinitionPath = index.specAbsolutePath + "#/" + locPath - _, jsonPath = utils.ConvertComponentIdIntoFriendlyPathSearch(definitionPath) + fullDefinitionPath = buildDefinitionPath(index.specAbsolutePath, seenPath, keyNode.Value) + definitionPath = fullDefinitionPath[len(index.specAbsolutePath):] + if !index.skipMetadataCollection() { + _, jsonPath = utils.ConvertComponentIdIntoFriendlyPathSearch(definitionPath) + } } ref := &Reference{ @@ -68,6 +101,7 @@ func (index *SpecIndex) collectMapSchemaDefinitions(parent, node *yaml.Node, see } label := "" + prefix := "" for h, prop := range propertiesNode.Content { if h%2 == 0 { label = prop.Value @@ -76,11 +110,14 @@ func (index *SpecIndex) collectMapSchemaDefinitions(parent, node *yaml.Node, see var jsonPath, definitionPath, fullDefinitionPath string if len(seenPath) > 0 || keyNode.Value != "" && label != "" { - loc := append(seenPath, keyNode.Value, label) - locPath := strings.Join(loc, "/") - definitionPath = "#/" + locPath - fullDefinitionPath = index.specAbsolutePath + "#/" + locPath - _, jsonPath = utils.ConvertComponentIdIntoFriendlyPathSearch(definitionPath) + if prefix == "" { + prefix = buildDefinitionPath(index.specAbsolutePath, seenPath, keyNode.Value) + } + fullDefinitionPath = prefix + "/" + label + definitionPath = fullDefinitionPath[len(index.specAbsolutePath):] + if !index.skipMetadataCollection() { + _, jsonPath = utils.ConvertComponentIdIntoFriendlyPathSearch(definitionPath) + } } ref := &Reference{ @@ -111,17 +148,20 @@ func (index *SpecIndex) collectArraySchemaDefinitions(parent, node *yaml.Node, s keyNode := node.Content[keyIndex] arrayNode := node.Content[keyIndex+1] + prefix := "" for h, element := range arrayNode.Content { var jsonPath, definitionPath, fullDefinitionPath string if len(seenPath) > 0 { - loc := append(seenPath, keyNode.Value, strconv.Itoa(h)) - locPath := strings.Join(loc, "/") - definitionPath = "#/" + locPath - fullDefinitionPath = index.specAbsolutePath + "#/" + locPath - _, jsonPath = utils.ConvertComponentIdIntoFriendlyPathSearch(definitionPath) + if prefix == "" { + prefix = buildDefinitionPath(index.specAbsolutePath, seenPath, keyNode.Value) + } + fullDefinitionPath = prefix + "/" + strconv.Itoa(h) + definitionPath = fullDefinitionPath[len(index.specAbsolutePath):] } else { definitionPath = "#/" + keyNode.Value fullDefinitionPath = index.specAbsolutePath + "#/" + keyNode.Value + } + if !index.skipMetadataCollection() { _, jsonPath = utils.ConvertComponentIdIntoFriendlyPathSearch(definitionPath) } diff --git a/index/extract_refs_lookup.go b/index/extract_refs_lookup.go index 4062daca..288ef768 100644 --- a/index/extract_refs_lookup.go +++ b/index/extract_refs_lookup.go @@ -197,8 +197,9 @@ func (index *SpecIndex) ExtractComponentsFromRefs(ctx context.Context, refs []*R } func (index *SpecIndex) locateRef(ctx context.Context, ref *Reference) *Reference { - uri := strings.Split(ref.FullDefinition, "#/") - isExternalRef := len(uri) == 2 && len(uri[0]) > 0 + // match strings.Split len==2 semantics: exactly one "#/" with a non-empty file part. + refFile, refFragment, refCut := strings.Cut(ref.FullDefinition, "#/") + isExternalRef := refCut && refFile != "" && !strings.Contains(refFragment, "#/") if isExternalRef { index.refLock.Lock() } @@ -221,11 +222,11 @@ func (index *SpecIndex) locateRef(ctx context.Context, ref *Reference) *Referenc } if located.Node != nil { + index.awaitNodeMap() index.nodeMapLock.RLock() - if located.Node.Line > 1 && len(index.nodeMap[located.Node.Line-1]) > 0 { - for _, v := range index.nodeMap[located.Node.Line-1] { - located.KeyNode = v - break + if prevLine := located.Node.Line - 1; prevLine > 0 && prevLine < len(index.nodeLines) { + if entries := index.nodeLines[prevLine]; len(entries) > 0 { + located.KeyNode = entries[0].node } } index.nodeMapLock.RUnlock() diff --git a/index/extract_refs_metadata.go b/index/extract_refs_metadata.go index ce9b82f7..08523779 100644 --- a/index/extract_refs_metadata.go +++ b/index/extract_refs_metadata.go @@ -16,6 +16,13 @@ type metadataPathAction struct { stop bool } +// skipMetadataCollection reports whether diagnostic metadata collection is disabled. +// Only collection is skipped: the path actions extractNodeMetadata returns drive +// seenPath handling for every downstream ref, so that logic always runs. +func (index *SpecIndex) skipMetadataCollection() bool { + return index.config != nil && index.config.SkipMetadataCollection +} + func (index *SpecIndex) extractNodeMetadata(node, parent *yaml.Node, seenPath []string, keyIndex int) metadataPathAction { keyNode := node.Content[keyIndex] if keyNode == nil || keyNode.Value == "" || keyNode.Value == "$ref" || keyNode.Value == "$id" { @@ -31,8 +38,7 @@ func (index *SpecIndex) extractNodeMetadata(node, parent *yaml.Node, seenPath [] var jsonPathComputed bool computeJSONPath := func() string { if !jsonPathComputed { - loc := append(seenPath, segment) - definitionPath := "#/" + strings.Join(loc, "/") + definitionPath := buildDefinitionPath("", seenPath, segment) _, jsonPath = utils.ConvertComponentIdIntoFriendlyPathSearch(definitionPath) jsonPathComputed = true } @@ -47,7 +53,7 @@ func (index *SpecIndex) extractNodeMetadata(node, parent *yaml.Node, seenPath [] if isMetadataPropertyNamePath(seenPath) { return metadataPathAction{appendSegment: true, stop: true} } - if !metadataPathContainsExamples(seenPath) { + if !metadataPathContainsExamples(seenPath) && !index.skipMetadataCollection() { refNode := metadataValueNode(node, keyIndex) ref := &DescriptionReference{ ParentNode: parent, @@ -69,25 +75,33 @@ func (index *SpecIndex) extractNodeMetadata(node, parent *yaml.Node, seenPath [] if metadataPathContainsExamples(seenPath) { return metadataPathAction{stop: true} } - refNode := metadataValueNode(node, keyIndex) - index.allSummaries = append(index.allSummaries, &DescriptionReference{ - ParentNode: parent, - Content: refNode.Value, - Path: computeJSONPath(), - Node: refNode, - KeyNode: keyNode, - IsSummary: true, - }) - index.summaryCount++ + if !index.skipMetadataCollection() { + refNode := metadataValueNode(node, keyIndex) + index.allSummaries = append(index.allSummaries, &DescriptionReference{ + ParentNode: parent, + Content: refNode.Value, + Path: computeJSONPath(), + Node: refNode, + KeyNode: keyNode, + IsSummary: true, + }) + index.summaryCount++ + } case "security": - index.collectSecurityRequirementMetadata(node, keyIndex, computeJSONPath()) + if !index.skipMetadataCollection() { + index.collectSecurityRequirementMetadata(node, keyIndex, computeJSONPath()) + } case "enum": if len(seenPath) > 0 && seenPath[len(seenPath)-1] == "properties" { return metadataPathAction{appendSegment: true, stop: true} } - index.collectEnumMetadata(node, parent, keyIndex, computeJSONPath()) + if !index.skipMetadataCollection() { + index.collectEnumMetadata(node, parent, keyIndex, computeJSONPath()) + } case "properties": - index.collectObjectWithPropertiesMetadata(node, parent, keyNode, computeJSONPath()) + if !index.skipMetadataCollection() { + index.collectObjectWithPropertiesMetadata(node, parent, keyNode, computeJSONPath()) + } } return metadataPathAction{appendSegment: true} diff --git a/index/index_model.go b/index/index_model.go index 32eb0290..5e159d31 100644 --- a/index/index_model.go +++ b/index/index_model.go @@ -227,6 +227,20 @@ type SpecIndexConfig struct { // PropertyMergeStrategy defines how to handle conflicts when merging properties. PropertyMergeStrategy datamodel.PropertyMergeStrategy + // SkipMetadataCollection disables the collection of diagnostic metadata during indexing: + // descriptions, summaries, enums, objects-with-properties, security requirement + // references, and the JSONPath `Path` values on inline schema references. Skipping + // them significantly reduces allocations and retained memory when parsing large + // documents. Reference extraction and resolution are unaffected. + // + // -- UNSAFE FOR DIAGNOSTIC, RULE, OR PATH CONSUMERS -- + // When enabled, GetAllDescriptions, GetAllSummaries, GetAllEnums, + // GetAllObjectsWithProperties, GetSecurityRequirementReferences and the related + // counts are intentionally empty/zero, and inline schema Reference.Path values are + // empty strings. vacuum and any other tool that consumes index metadata or Path + // values must NOT enable this. Defaults to false (everything is collected). + SkipMetadataCollection bool + // private fields uri []string id string @@ -279,6 +293,7 @@ func (s *SpecIndexConfig) ToDocumentConfiguration() *datamodel.DocumentConfigura ResolveNestedRefsWithDocumentContext: s.ResolveNestedRefsWithDocumentContext, PropertyMergeStrategy: strategy, SkipExternalRefResolution: s.SkipExternalRefResolution, + SkipMetadataCollection: s.SkipMetadataCollection, Logger: s.Logger, } } @@ -416,7 +431,8 @@ type SpecIndex struct { built bool uri []string logger *slog.Logger - nodeMap map[int]map[int]*yaml.Node + nodeLines [][]nodeLineEntry + legacyNodeMap map[int]map[int]*yaml.Node // materialized on demand by GetNodeMap only nodeMapCompleted chan struct{} pendingResolve []refMap highModelCache Cache @@ -444,8 +460,30 @@ func (index *SpecIndex) GetConfig() *SpecIndexConfig { } // GetNodeMap returns the line-to-column-to-node map built during indexing. +// The map is materialized from the internal line index on first call and cached. +// +// Deprecated: use GetNode for single lookups; this method exists for API +// compatibility and allocates a full legacy map on first use. func (index *SpecIndex) GetNodeMap() map[int]map[int]*yaml.Node { - return index.nodeMap + index.awaitNodeMap() + index.nodeMapLock.Lock() + defer index.nodeMapLock.Unlock() + if index.legacyNodeMap != nil || index.nodeLines == nil { + return index.legacyNodeMap + } + legacy := make(map[int]map[int]*yaml.Node) + for line, entries := range index.nodeLines { + if len(entries) == 0 { + continue + } + cols := make(map[int]*yaml.Node, len(entries)) + for _, e := range entries { + cols[int(e.column)] = e.node + } + legacy[line] = cols + } + index.legacyNodeMap = legacy + return legacy } // GetCache returns the reference lookup cache used during resolution. @@ -543,7 +581,12 @@ func (index *SpecIndex) releaseComponentIndexes() { } func (index *SpecIndex) releaseDerivedState() { - index.nodeMap = nil + // node-map state is read concurrently via awaitNodeMap/GetNode; nil it + // under the same lock those readers use. + index.nodeMapLock.Lock() + index.nodeLines = nil + index.legacyNodeMap = nil + index.nodeMapLock.Unlock() index.allDescriptions = nil index.allSummaries = nil index.allEnums = nil @@ -603,7 +646,9 @@ func (index *SpecIndex) resetRuntimeState() { index.built = false index.componentIndexChan = nil index.polyComponentIndexChan = nil - index.nodeMapCompleted = nil + // nodeMapCompleted is deliberately NOT nilled: it is closed (retaining + // nothing) and awaitNodeMap reads the field without a lock on the GetNode + // hot path - writing nil here would race every reader for zero benefit. } // SetAbsolutePath sets the absolute path to the spec file for the index. Will be absolute, either as a http link or a file. diff --git a/index/index_model_test.go b/index/index_model_test.go index 2dafd301..26d8c8b2 100644 --- a/index/index_model_test.go +++ b/index/index_model_test.go @@ -90,6 +90,7 @@ func TestSpecIndexConfig_ToDocumentConfiguration_AllFields(t *testing.T) { AllowUnknownExtensionContentDetection: true, TransformSiblingRefs: true, ResolveNestedRefsWithDocumentContext: true, + SkipMetadataCollection: true, } result := config.ToDocumentConfiguration() @@ -107,6 +108,7 @@ func TestSpecIndexConfig_ToDocumentConfiguration_AllFields(t *testing.T) { assert.True(t, result.AllowUnknownExtensionContentDetection) assert.True(t, result.TransformSiblingRefs) assert.True(t, result.ResolveNestedRefsWithDocumentContext) + assert.True(t, result.SkipMetadataCollection) assert.False(t, result.MergeReferencedProperties) // default disabled for index configs } @@ -144,7 +146,8 @@ func TestSpecIndex_Release(t *testing.T) { rawSequencedRefs: []*Reference{{}}, allMappedRefs: map[string]*Reference{"mapped": {}}, allMappedRefsSequenced: []*ReferenceMapped{{}}, - nodeMap: map[int]map[int]*yaml.Node{1: {1: &yaml.Node{}}}, + nodeLines: [][]nodeLineEntry{nil, {{column: 1, node: &yaml.Node{}}}}, + legacyNodeMap: map[int]map[int]*yaml.Node{1: {1: &yaml.Node{}}}, allDescriptions: []*DescriptionReference{{}}, allEnums: []*EnumReference{{}}, circularReferences: []*CircularReferenceResult{{}}, @@ -181,8 +184,9 @@ func TestSpecIndex_Release(t *testing.T) { assert.Nil(t, idx.allMappedRefs) assert.Nil(t, idx.allMappedRefsSequenced) - // node map - assert.Nil(t, idx.nodeMap) + // node line index and legacy map + assert.Nil(t, idx.nodeLines) + assert.Nil(t, idx.legacyNodeMap) // descriptions, enums assert.Nil(t, idx.allDescriptions) @@ -209,7 +213,10 @@ func TestSpecIndex_Release(t *testing.T) { assert.False(t, idx.allowCircularReferences) assert.Nil(t, idx.componentIndexChan) assert.Nil(t, idx.polyComponentIndexChan) - assert.Nil(t, idx.nodeMapCompleted) + // nodeMapCompleted survives release on purpose: it is a closed channel + // (retains nothing) and awaitNodeMap reads it without a lock, so nilling + // it would race concurrent GetNode callers. + assert.NotNil(t, idx.nodeMapCompleted) // resolver released and niled assert.Nil(t, idx.resolver) diff --git a/index/inline_collector_parity_test.go b/index/inline_collector_parity_test.go new file mode 100644 index 00000000..b0a4c58f --- /dev/null +++ b/index/inline_collector_parity_test.go @@ -0,0 +1,91 @@ +// Copyright 2026 Princess B33f Heavy Industries / Dave Shanley +// SPDX-License-Identifier: MIT + +package index + +import ( + "crypto/sha256" + "encoding/hex" + "fmt" + "os" + "path/filepath" + "sort" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.yaml.in/yaml/v4" +) + +// inlineCollectorParityPath pins the exact (Definition, FullDefinition, Path) tuples +// produced by the inline schema collectors for real documents. The converter golden +// corpus (utils/testdata) protects the path converter; this fixture protects the +// string assembly inside the collectors themselves. Regenerate with: +// +// GOLDEN_REGENERATE=true go test ./index -run TestInlineCollectorParity +const inlineCollectorParityPath = "testdata/inline_collector_parity.txt" + +// parityAbsolutePath is a fixed absolute path so fixtures are machine independent. +const parityAbsolutePath = "/parity/test/root.yaml" + +var parityScanSpecs = []string{ + "../test_specs/stripe.yaml", + "../test_specs/burgershop.openapi.yaml", + "../test_specs/mixedref-burgershop.openapi.yaml", + "../test_specs/k8s.json", +} + +func collectInlineParityRows(t *testing.T) []string { + var rows []string + for _, spec := range parityScanSpecs { + data, err := os.ReadFile(spec) + require.NoError(t, err) + + var rootNode yaml.Node + require.NoError(t, yaml.Unmarshal(data, &rootNode)) + + cfg := CreateOpenAPIIndexConfig() + cfg.AllowRemoteLookup = false + cfg.AllowFileLookup = false + cfg.SpecAbsolutePath = parityAbsolutePath + idx := NewSpecIndexWithConfig(&rootNode, cfg) + + collections := []struct { + name string + refs []*Reference + }{ + {"inline", idx.GetAllInlineSchemas()}, + {"refs", idx.GetAllReferenceSchemas()}, + {"objects", idx.GetAllInlineSchemaObjects()}, + } + for _, c := range collections { + tuples := make([]string, 0, len(c.refs)) + for _, ref := range c.refs { + tuples = append(tuples, ref.Definition+"\x1f"+ref.FullDefinition+"\x1f"+ref.Path) + } + sort.Strings(tuples) + h := sha256.Sum256([]byte(strings.Join(tuples, "\n"))) + rows = append(rows, fmt.Sprintf("%s\t%s\t%d\t%s", + filepath.Base(spec), c.name, len(tuples), hex.EncodeToString(h[:]))) + } + } + return rows +} + +func TestInlineCollectorParity(t *testing.T) { + rows := collectInlineParityRows(t) + + if os.Getenv("GOLDEN_REGENERATE") == "true" { + require.NoError(t, os.MkdirAll("testdata", 0o755)) + require.NoError(t, os.WriteFile(inlineCollectorParityPath, + []byte(strings.Join(rows, "\n")+"\n"), 0o644)) + t.Logf("inline collector parity fixture regenerated: %d rows", len(rows)) + } + + expected, err := os.ReadFile(inlineCollectorParityPath) + require.NoError(t, err, "parity fixture missing - regenerate with GOLDEN_REGENERATE=true") + expectedText := strings.ReplaceAll(string(expected), "\r\n", "\n") + assert.Equal(t, strings.TrimRight(expectedText, "\n"), strings.Join(rows, "\n"), + "inline collector output changed - Definition/FullDefinition/Path must stay byte-identical") +} diff --git a/index/map_index_nodes.go b/index/map_index_nodes.go index 4f8ecde5..84cb1f10 100644 --- a/index/map_index_nodes.go +++ b/index/map_index_nodes.go @@ -41,44 +41,96 @@ type NodeOrigin struct { Index *SpecIndex `json:"-" yaml:"-"` } +// nodeLineEntry is a single (column, node) pair on one line of the spec. Lines hold very +// few nodes, so a small slice scanned linearly is far cheaper than a per-line map. +type nodeLineEntry struct { + column int32 + node *yaml.Node +} + // GetNode returns a node from the spec based on a line and column. The second return var bool is true -// if the node was found, false if not. +// if the node was found, false if not. Blocks until the node line index has been fully built. func (index *SpecIndex) GetNode(line int, column int) (*yaml.Node, bool) { + index.awaitNodeMap() index.nodeMapLock.RLock() defer index.nodeMapLock.RUnlock() - if index.nodeMap[line] == nil { - return nil, false - } - node := index.nodeMap[line][column] + node := lookupNodeLines(index.nodeLines, line, column) return node, node != nil } -// MapNodes maps all nodes in the document to a map of line/column to node. -// Writes directly to index.nodeMap with lock protection (concurrent reads -// may happen from ExtractRefs running in parallel). +// awaitNodeMap blocks until MapNodes has published the node line index. It is a no-op +// once the index has been built or released (the completion channel is close-only). +func (index *SpecIndex) awaitNodeMap() { + if ch := index.nodeMapCompleted; ch != nil { + <-ch + } +} + +// lookupNodeLines returns the node stored at line/column, or nil if absent. +func lookupNodeLines(lines [][]nodeLineEntry, line, column int) *yaml.Node { + if line < 0 || line >= len(lines) { + return nil + } + for _, e := range lines[line] { + if int(e.column) == column { + return e.node + } + } + return nil +} + +// MapNodes maps all nodes in the document by line and column. The index is built into a +// local structure without locking, published under a single lock, and completion is +// signalled by closing nodeMapCompleted (close-only: supports any number of waiters). func (index *SpecIndex) MapNodes(rootNode *yaml.Node) { - mapNodesRecursive(rootNode, index, true) - index.nodeMapCompleted <- struct{}{} + sizeHint := 0 + if index.config != nil && index.config.SpecInfo != nil { + sizeHint = index.config.SpecInfo.NumLines + } + // lines are 1-based; +1 so line NumLines is directly addressable. + lines := make([][]nodeLineEntry, sizeHint+1) + lines = mapNodesRecursive(rootNode, lines) + index.nodeMapLock.Lock() + index.nodeLines = lines + index.nodeMapLock.Unlock() close(index.nodeMapCompleted) } -func mapNodesRecursive(node *yaml.Node, index *SpecIndex, root bool) { +func mapNodesRecursive(node *yaml.Node, lines [][]nodeLineEntry) [][]nodeLineEntry { if node.Kind == yaml.DocumentNode { node = node.Content[0] } for _, child := range node.Content { - index.nodeMapLock.Lock() - if index.nodeMap[child.Line] == nil { - index.nodeMap[child.Line] = make(map[int]*yaml.Node) + lines = addNodeLineEntry(lines, child) + lines = mapNodesRecursive(child, lines) + } + return addNodeLineEntry(lines, node) +} + +// addNodeLineEntry records node at its line/column, preserving the previous map +// semantics: a later write to the same line/column replaces the earlier one +// (parents are written after their children, so parents win collisions). +func addNodeLineEntry(lines [][]nodeLineEntry, node *yaml.Node) [][]nodeLineEntry { + line := node.Line + if line < 0 { + return lines + } + if line >= len(lines) { + grown := len(lines) * 2 + if grown <= line { + grown = line + 1 } - index.nodeMap[child.Line][child.Column] = child - index.nodeMapLock.Unlock() - mapNodesRecursive(child, index, false) + expanded := make([][]nodeLineEntry, grown) + copy(expanded, lines) + lines = expanded } - index.nodeMapLock.Lock() - if index.nodeMap[node.Line] == nil { - index.nodeMap[node.Line] = make(map[int]*yaml.Node) + entries := lines[line] + for i := range entries { + if int(entries[i].column) == node.Column { + entries[i].node = node + return lines + } } - index.nodeMap[node.Line][node.Column] = node - index.nodeMapLock.Unlock() + lines[line] = append(entries, nodeLineEntry{column: int32(node.Column), node: node}) + return lines } diff --git a/index/map_index_nodes_test.go b/index/map_index_nodes_test.go index 1e9693d4..79e812ce 100644 --- a/index/map_index_nodes_test.go +++ b/index/map_index_nodes_test.go @@ -42,25 +42,27 @@ func TestSpecIndex_MapNodes(t *testing.T) { // check missing line var ok bool - mappedKeyNode, ok = index.GetNode(999, 999) + mappedKeyNode, ok = index.GetNode(999999, 999) assert.False(t, ok) assert.Nil(t, mappedKeyNode) + // check missing column on an existing line mappedKeyNode, ok = index.GetNode(12, 999) assert.False(t, ok) assert.Nil(t, mappedKeyNode) - index.nodeMap[15] = nil - mappedKeyNode, ok = index.GetNode(15, 999) + // check negative line + mappedKeyNode, ok = index.GetNode(-1, 1) assert.False(t, ok) assert.Nil(t, mappedKeyNode) } func TestSpecIndex_GetNode_MissDoesNotLeakReadLock(t *testing.T) { index := NewSpecIndexWithConfig(&yaml.Node{}, CreateOpenAPIIndexConfig()) - index.nodeMap = map[int]map[int]*yaml.Node{ - 1: {1: {Value: "ok"}}, - 2: nil, + index.nodeLines = [][]nodeLineEntry{ + nil, + {{column: 1, node: &yaml.Node{Value: "ok"}}}, + nil, } node, ok := index.GetNode(2, 1) @@ -70,7 +72,7 @@ func TestSpecIndex_GetNode_MissDoesNotLeakReadLock(t *testing.T) { locked := make(chan struct{}) go func() { index.nodeMapLock.Lock() - index.nodeMap[3] = map[int]*yaml.Node{} + index.nodeLines = append(index.nodeLines, nil) index.nodeMapLock.Unlock() close(locked) }() @@ -82,6 +84,88 @@ func TestSpecIndex_GetNode_MissDoesNotLeakReadLock(t *testing.T) { } } +func TestSpecIndex_MapNodes_LineZeroAndGrowth(t *testing.T) { + // a zero-value node reports line 0; nodes can also report lines beyond any + // preallocated hint. both must be stored and retrievable without panics. + lines := make([][]nodeLineEntry, 1) + zeroNode := &yaml.Node{} + lines = addNodeLineEntry(lines, zeroNode) + assert.Same(t, zeroNode, lookupNodeLines(lines, 0, 0)) + + farNode := &yaml.Node{Line: 500, Column: 3} + lines = addNodeLineEntry(lines, farNode) + assert.GreaterOrEqual(t, len(lines), 501) + assert.Same(t, farNode, lookupNodeLines(lines, 500, 3)) + + // a negative line is ignored, not stored. + negNode := &yaml.Node{Line: -1, Column: 1} + lines = addNodeLineEntry(lines, negNode) + assert.Nil(t, lookupNodeLines(lines, -1, 1)) + + // growth that doubles instead of exact-fit: line just past the end. + nearNode := &yaml.Node{Line: 501, Column: 9} + lines = addNodeLineEntry(lines, nearNode) + assert.Same(t, nearNode, lookupNodeLines(lines, 501, 9)) +} + +func TestSpecIndex_MapNodes_OverwriteSemantics(t *testing.T) { + // a later write to the same line/column replaces the earlier one — parents are + // written after children in mapNodesRecursive, so parents win collisions. + first := &yaml.Node{Line: 4, Column: 2, Value: "child"} + second := &yaml.Node{Line: 4, Column: 2, Value: "parent"} + + var lines [][]nodeLineEntry + lines = addNodeLineEntry(lines, first) + lines = addNodeLineEntry(lines, second) + + assert.Same(t, second, lookupNodeLines(lines, 4, 2)) + assert.Len(t, lines[4], 1) +} + +func TestSpecIndex_GetNodeMap_LegacyMaterialization(t *testing.T) { + petstore, _ := os.ReadFile("../test_specs/petstorev3.json") + var rootNode yaml.Node + _ = yaml.Unmarshal(petstore, &rootNode) + + index := NewSpecIndexWithConfig(&rootNode, CreateOpenAPIIndexConfig()) + + // the legacy map must never be materialized by internal build paths. + assert.Nil(t, index.legacyNodeMap) + + legacy := index.GetNodeMap() + assert.NotNil(t, legacy) + + // every entry in the legacy map must match the line index exactly. + total := 0 + for line, cols := range legacy { + for col, n := range cols { + total++ + found, ok := index.GetNode(line, col) + assert.True(t, ok) + assert.Same(t, n, found) + } + } + assert.Positive(t, total) + + // second call returns the cached map. + assert.Equal(t, reflect.ValueOf(legacy).Pointer(), reflect.ValueOf(index.GetNodeMap()).Pointer()) +} + +func TestSpecIndex_GetNodeMap_AfterRelease(t *testing.T) { + petstore, _ := os.ReadFile("../test_specs/petstorev3.json") + var rootNode yaml.Node + _ = yaml.Unmarshal(petstore, &rootNode) + + index := NewSpecIndexWithConfig(&rootNode, CreateOpenAPIIndexConfig()) + index.Release() + + // after release: no legacy map, no lookups, no blocking. + assert.Nil(t, index.GetNodeMap()) + node, ok := index.GetNode(1, 1) + assert.False(t, ok) + assert.Nil(t, node) +} + func BenchmarkSpecIndex_MapNodes(b *testing.B) { petstore, _ := os.ReadFile("../test_specs/petstorev3.json") var rootNode yaml.Node diff --git a/index/search_index.go b/index/search_index.go index 9e0c92d1..e0317116 100644 --- a/index/search_index.go +++ b/index/search_index.go @@ -171,47 +171,49 @@ func (index *SpecIndex) SearchIndexForReferenceByReferenceWithContext(ctx contex absPath = index.config.BasePath } var roloLookup string - uri := strings.Split(ref, "#/") - if len(uri) == 2 { - if uri[0] != "" { - if strings.HasPrefix(uri[0], "http") { + uriFile, uriFragment, uriCut := strings.Cut(ref, "#/") + // match strings.Split(ref, "#/") len==2 semantics: exactly one separator. + singleFragment := uriCut && !strings.Contains(uriFragment, "#/") + if singleFragment { + if uriFile != "" { + if strings.HasPrefix(uriFile, "http") { roloLookup = searchRef.FullDefinition } else { - if filepath.IsAbs(uri[0]) { - roloLookup = uri[0] + if filepath.IsAbs(uriFile) { + roloLookup = uriFile } else { if filepath.Ext(absPath) != "" { absPath = filepath.Dir(absPath) } - roloLookup = index.resolveRelativeFilePath(absPath, uri[0]) + roloLookup = index.resolveRelativeFilePath(absPath, uriFile) } } } else { - if filepath.Ext(uri[1]) != "" { + if filepath.Ext(uriFragment) != "" { roloLookup = absPath } else { roloLookup = "" } - ref = fmt.Sprintf("#/%s", uri[1]) - refAlt = fmt.Sprintf("%s#/%s", absPath, uri[1]) + ref = fmt.Sprintf("#/%s", uriFragment) + refAlt = fmt.Sprintf("%s#/%s", absPath, uriFragment) } } else { - if filepath.IsAbs(uri[0]) { - roloLookup = uri[0] + if filepath.IsAbs(uriFile) { + roloLookup = uriFile } else { - if strings.HasPrefix(uri[0], "http") { + if strings.HasPrefix(uriFile, "http") { roloLookup = ref } else { if filepath.Ext(absPath) != "" { absPath = filepath.Dir(absPath) } - roloLookup = index.resolveRelativeFilePath(absPath, uri[0]) + roloLookup = index.resolveRelativeFilePath(absPath, uriFile) } } - ref = uri[0] + ref = uriFile } if strings.Contains(ref, "%") { // decode the url. @@ -258,9 +260,7 @@ func (index *SpecIndex) SearchIndexForReferenceByReferenceWithContext(ctx contex // component-tree walking inside the remote file. if roloLookup != "" { - if strings.Contains(roloLookup, "#") { - roloLookup = strings.Split(roloLookup, "#")[0] - } + roloLookup, _, _ = strings.Cut(roloLookup, "#") b := filepath.Base(roloLookup) sfn := index.GetSpecFileName() @@ -270,8 +270,8 @@ func (index *SpecIndex) SearchIndexForReferenceByReferenceWithContext(ctx contex if b == sfn && roloLookup == abp { // if the reference is the same as the spec file name, we should look through the index for the component var r *Reference - if len(uri) == 2 { - r = index.FindComponentInRoot(ctx, fmt.Sprintf("#/%s", uri[1])) + if singleFragment { + r = index.FindComponentInRoot(ctx, fmt.Sprintf("#/%s", uriFragment)) } return r, index, ctx } @@ -352,12 +352,12 @@ func (index *SpecIndex) SearchIndexForReferenceByReferenceWithContext(ctx contex node, _ := rFile.GetContentAsYAMLNode() if node != nil { var found *Reference - exp := strings.Split(ref, "#/") + expFile, expFragment, expCut := strings.Cut(ref, "#/") compId := ref - if len(exp) == 2 { - compId = fmt.Sprintf("#/%s", exp[1]) - found = FindComponent(ctx, node, compId, exp[0], idx) + if expCut && !strings.Contains(expFragment, "#/") { + compId = fmt.Sprintf("#/%s", expFragment) + found = FindComponent(ctx, node, compId, expFile, idx) } if found == nil { found = idx.FindComponent(ctx, ref) diff --git a/index/search_rolodex.go b/index/search_rolodex.go index 6eb3f4bb..d61c6c50 100644 --- a/index/search_rolodex.go +++ b/index/search_rolodex.go @@ -97,51 +97,49 @@ func (r *Rolodex) FindNodeOrigin(node *yaml.Node) *NodeOrigin { // is returned, otherwise nil is returned. func (index *SpecIndex) FindNodeOrigin(node *yaml.Node) *NodeOrigin { if node != nil { + index.awaitNodeMap() index.nodeMapLock.RLock() - if index.nodeMap[node.Line] != nil { - if index.nodeMap[node.Line][node.Column] != nil { - foundNode := index.nodeMap[node.Line][node.Column] - match := false + if foundNode := lookupNodeLines(index.nodeLines, node.Line, node.Column); foundNode != nil { + match := false - if foundNode == node { - match = true + if foundNode == node { + match = true + } + + // if the found node is a map. iterate through the content until we locate the node at that position + if !match && (utils.IsNodeMap(foundNode) || + utils.IsNodeArray(foundNode)) && (utils.IsNodeMap(node) || utils.IsNodeArray(node)) { + if len(node.Content) == len(foundNode.Content) { + // hash node and found node + match = checkHash(node, foundNode) } + } else { + if !match { + // hash node and found node + match = checkHash(node, foundNode) - // if the found node is a map. iterate through the content until we locate the node at that position - if !match && (utils.IsNodeMap(foundNode) || - utils.IsNodeArray(foundNode)) && (utils.IsNodeMap(node) || utils.IsNodeArray(node)) { - if len(node.Content) == len(foundNode.Content) { - // hash node and found node - match = checkHash(node, foundNode) - } - } else { if !match { - // hash node and found node - match = checkHash(node, foundNode) - - if !match { - // check if the found node is a map and if the first item in the map - // has the same line and column, as well as the same value - if utils.IsNodeMap(foundNode) && len(foundNode.Content) > 0 { - if foundNode.Content[0].Line == node.Line && - foundNode.Content[0].Column == node.Column && - foundNode.Content[0].Value == node.Value { - match = true - } + // check if the found node is a map and if the first item in the map + // has the same line and column, as well as the same value + if utils.IsNodeMap(foundNode) && len(foundNode.Content) > 0 { + if foundNode.Content[0].Line == node.Line && + foundNode.Content[0].Column == node.Column && + foundNode.Content[0].Value == node.Value { + match = true } } } } + } - if match { - index.nodeMapLock.RUnlock() - return &NodeOrigin{ - Node: foundNode, - Line: node.Line, - Column: node.Column, - AbsoluteLocation: index.specAbsolutePath, - Index: index, - } + if match { + index.nodeMapLock.RUnlock() + return &NodeOrigin{ + Node: foundNode, + Line: node.Line, + Column: node.Column, + AbsoluteLocation: index.specAbsolutePath, + Index: index, } } } diff --git a/index/skip_metadata_collection_test.go b/index/skip_metadata_collection_test.go new file mode 100644 index 00000000..e6f0b761 --- /dev/null +++ b/index/skip_metadata_collection_test.go @@ -0,0 +1,83 @@ +// Copyright 2026 Princess B33f Heavy Industries / Dave Shanley +// SPDX-License-Identifier: MIT + +package index + +import ( + "os" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.yaml.in/yaml/v4" +) + +func buildSkipMetadataIndex(t *testing.T, skip bool) *SpecIndex { + data, err := os.ReadFile("../test_specs/burgershop.openapi.yaml") + require.NoError(t, err) + var rootNode yaml.Node + require.NoError(t, yaml.Unmarshal(data, &rootNode)) + + cfg := CreateOpenAPIIndexConfig() + cfg.AllowRemoteLookup = false + cfg.AllowFileLookup = false + cfg.SkipMetadataCollection = skip + return NewSpecIndexWithConfig(&rootNode, cfg) +} + +// TestSkipMetadataCollection_RefExtractionParity proves the flag only affects +// diagnostic metadata: reference extraction must be identical with it on or off. +func TestSkipMetadataCollection_RefExtractionParity(t *testing.T) { + full := buildSkipMetadataIndex(t, false) + skip := buildSkipMetadataIndex(t, true) + + fullRefs := full.GetRawReferencesSequenced() + skipRefs := skip.GetRawReferencesSequenced() + require.Equal(t, len(fullRefs), len(skipRefs)) + for i := range fullRefs { + assert.Equal(t, fullRefs[i].Definition, skipRefs[i].Definition) + assert.Equal(t, fullRefs[i].FullDefinition, skipRefs[i].FullDefinition) + } + + // inline schema collections are still gathered (the resolver searches them by + // FullDefinition), only their JSONPath Path values are skipped. + fullInline := full.GetAllInlineSchemas() + skipInline := skip.GetAllInlineSchemas() + require.Equal(t, len(fullInline), len(skipInline)) + require.NotEmpty(t, fullInline) + for i := range fullInline { + assert.Equal(t, fullInline[i].Definition, skipInline[i].Definition) + assert.Equal(t, fullInline[i].FullDefinition, skipInline[i].FullDefinition) + assert.NotEmpty(t, fullInline[i].Path) + assert.Empty(t, skipInline[i].Path) + } + + fullRefSchemas := full.GetAllReferenceSchemas() + skipRefSchemas := skip.GetAllReferenceSchemas() + require.Equal(t, len(fullRefSchemas), len(skipRefSchemas)) + + mapped := full.GetMappedReferences() + mappedSkip := skip.GetMappedReferences() + assert.Equal(t, len(mapped), len(mappedSkip)) +} + +// TestSkipMetadataCollection_MetadataEmpty proves all diagnostic collections are +// intentionally empty when the flag is enabled, and populated when it is not. +func TestSkipMetadataCollection_MetadataEmpty(t *testing.T) { + full := buildSkipMetadataIndex(t, false) + skip := buildSkipMetadataIndex(t, true) + + assert.NotEmpty(t, full.GetAllDescriptions()) + assert.NotEmpty(t, full.GetAllSummaries()) + assert.NotEmpty(t, full.GetAllEnums()) + assert.NotEmpty(t, full.GetAllObjectsWithProperties()) + assert.NotEmpty(t, full.GetSecurityRequirementReferences()) + assert.Positive(t, full.descriptionCount) + + assert.Empty(t, skip.GetAllDescriptions()) + assert.Empty(t, skip.GetAllSummaries()) + assert.Empty(t, skip.GetAllEnums()) + assert.Empty(t, skip.GetAllObjectsWithProperties()) + assert.Empty(t, skip.GetSecurityRequirementReferences()) + assert.Zero(t, skip.descriptionCount) +} diff --git a/index/spec_index_build.go b/index/spec_index_build.go index 9df192ec..544696d4 100644 --- a/index/spec_index_build.go +++ b/index/spec_index_build.go @@ -62,7 +62,6 @@ func createNewIndex(ctx context.Context, rootNode *yaml.Node, index *SpecIndex, return index } index.nodeMapCompleted = make(chan struct{}) - index.nodeMap = make(map[int]map[int]*yaml.Node) go index.MapNodes(rootNode) index.cache = new(sync.Map) diff --git a/index/testdata/inline_collector_parity.txt b/index/testdata/inline_collector_parity.txt new file mode 100644 index 00000000..25f919c7 --- /dev/null +++ b/index/testdata/inline_collector_parity.txt @@ -0,0 +1,12 @@ +stripe.yaml inline 15928 01204e05677a268ba71235a4157edcd4654b0a4be8f3eb3600547dbb0d75461e +stripe.yaml refs 2712 47652e79cef0fad8791d17c2c2fafe45462545d7a7cd9d0a539e14528b1cb7c9 +stripe.yaml objects 3857 4a21396e646e17a5a2cbca7614f28f40f8862f7c9722daeaab2036d35116c333 +burgershop.openapi.yaml inline 26 fd272026c992ecf45db265ba77226fbdb82913a5ccf3c2234e0f73a8b66496a0 +burgershop.openapi.yaml refs 24 f2cc7dc5c24330e66d8effd08c1aa6ebd69d10e77b883db6032c230b4e35801f +burgershop.openapi.yaml objects 5 cf82c2ab6a949c333ed0817e3579db7aba6b9c622a6934e64812f3a8d60852ad +mixedref-burgershop.openapi.yaml inline 6 e8c3af4d3b89d49c0b32305123e89455310fe43accfaaf28c7c32ffc99423bf0 +mixedref-burgershop.openapi.yaml refs 15 657348a49dce28aee9bcacb027d4328bab286254f4725d987825d14e4b51ef79 +mixedref-burgershop.openapi.yaml objects 2 6b25726d63946c707c2cb45151dc969770bc5f7ada3ac613ddb4647a4b33f15f +k8s.json inline 1934 f0aa6b35287accea6aeb00bd15ce4251e79648c2063172f5b893a45ac35185db +k8s.json refs 2533 c251e8a4ab708495abf5e473da7c1f6acdd4d1423d465d356d0849874eae6d3b +k8s.json objects 441 a79b0751a208f832200dd66d960bed3844849bd91660705f158f768c3b06c0da diff --git a/index/utility_methods.go b/index/utility_methods.go index 1e8fc191..d201dcac 100644 --- a/index/utility_methods.go +++ b/index/utility_methods.go @@ -555,16 +555,7 @@ func findIndex(index *SpecIndex, i *yaml.Node) *SpecIndex { } allIndexes := rolodex.GetIndexes() for _, searchIndex := range allIndexes { - nodeMap := searchIndex.GetNodeMap() - line, ok := nodeMap[i.Line] - if !ok { - continue - } - node, ok := line[i.Column] - if !ok { - continue - } - if node == i { + if node, ok := searchIndex.GetNode(i.Line, i.Column); ok && node == i { return searchIndex } } diff --git a/utils/component_id_golden_test.go b/utils/component_id_golden_test.go new file mode 100644 index 00000000..4d8c827f --- /dev/null +++ b/utils/component_id_golden_test.go @@ -0,0 +1,181 @@ +// Copyright 2026 Princess B33f Heavy Industries / Dave Shanley +// SPDX-License-Identifier: MIT + +package utils_test + +import ( + "bufio" + "compress/gzip" + "fmt" + "os" + "path/filepath" + "sort" + "strings" + "testing" + + "github.com/pb33f/libopenapi/index" + "github.com/pb33f/libopenapi/utils" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.yaml.in/yaml/v4" +) + +// goldenCorpusPath is the byte-for-byte contract for ConvertComponentIdIntoFriendlyPathSearch. +// The file is gzipped; each line is: inputnamepath. It is generated from real +// reference and schema definitions collected by indexing every spec in test_specs/, plus +// hand-picked edge cases, all run through the converter. Regenerate with: +// +// GOLDEN_REGENERATE=true go test ./utils -run TestComponentIdGoldenCorpus +const goldenCorpusPath = "testdata/component_id_golden.txt.gz" + +// goldenCorpusPerSpecCap bounds the number of unique inputs taken per spec so the +// fixture stays reviewable. Inputs are sorted before capping, so the selection is +// deterministic; the cap is recorded per spec during generation, never silent. +const goldenCorpusPerSpecCap = 4000 + +// goldenEdgeCaseInputs are converter inputs that exercise documented quirks regardless +// of whether any spec in test_specs happens to contain them. +var goldenEdgeCaseInputs = []string{ + "", + "#/", + "#/components/schemas/Burger", + "#/components/schemas/Burger/properties/fries", + "#/paths/~1burgers~1{burgerId}/get", + "#/paths/~1burgers/post/responses/200", + "#/components/schemas/List/items/0", + "#/components/schemas/List/items/100", + "#/components/schemas/403_permission_denied", + "#/components/schemas/some%20space", + "#/components/schemas/async_search.submit#wait_for_completion_timeout", + "#/definitions/Pet", + "document.yaml#/components/schemas/Thing", + "/absolute/path/file.yaml#/components/schemas/Thing", + "not-a-pointer", + "#/a/b/c/d/e/f/g/h", +} + +func collectGoldenCorpusInputs(t *testing.T) []string { + specDir := "../test_specs" + entries, err := os.ReadDir(specDir) + require.NoError(t, err) + + seen := make(map[string]struct{}) + for _, in := range goldenEdgeCaseInputs { + seen[in] = struct{}{} + } + + for _, entry := range entries { + if entry.IsDir() { + continue + } + ext := filepath.Ext(entry.Name()) + if ext != ".yaml" && ext != ".json" { + continue + } + data, rErr := os.ReadFile(filepath.Join(specDir, entry.Name())) + require.NoError(t, rErr) + + var rootNode yaml.Node + if yaml.Unmarshal(data, &rootNode) != nil { + continue // some fixtures are intentionally unparsable + } + if len(rootNode.Content) == 0 { + continue + } + + cfg := index.CreateOpenAPIIndexConfig() + cfg.AllowRemoteLookup = false + cfg.AllowFileLookup = false + idx := index.NewSpecIndexWithConfig(&rootNode, cfg) + + specInputs := make(map[string]struct{}) + for _, ref := range idx.GetRawReferencesSequenced() { + specInputs[ref.Definition] = struct{}{} + } + for _, ref := range idx.GetAllSequencedReferences() { + specInputs[ref.Definition] = struct{}{} + } + for _, ref := range idx.GetAllInlineSchemas() { + specInputs[ref.Definition] = struct{}{} + } + for _, ref := range idx.GetAllReferenceSchemas() { + specInputs[ref.Definition] = struct{}{} + } + for def := range idx.GetMappedReferences() { + specInputs[def] = struct{}{} + } + + ordered := make([]string, 0, len(specInputs)) + for in := range specInputs { + if strings.ContainsAny(in, "\t\n") { + continue // the fixture format is tab separated lines + } + ordered = append(ordered, in) + } + sort.Strings(ordered) + if len(ordered) > goldenCorpusPerSpecCap { + // stride-sample across the sorted range so the selection spans the + // whole spec rather than the alphabetically-first slice. + t.Logf("golden corpus: sampling %s from %d down to %d inputs", entry.Name(), len(ordered), goldenCorpusPerSpecCap) + sampled := make([]string, 0, goldenCorpusPerSpecCap) + stride := float64(len(ordered)) / float64(goldenCorpusPerSpecCap) + for i := 0; i < goldenCorpusPerSpecCap; i++ { + sampled = append(sampled, ordered[int(float64(i)*stride)]) + } + ordered = sampled + } + for _, in := range ordered { + seen[in] = struct{}{} + } + } + + all := make([]string, 0, len(seen)) + for in := range seen { + all = append(all, in) + } + sort.Strings(all) + return all +} + +func TestComponentIdGoldenCorpus(t *testing.T) { + if os.Getenv("GOLDEN_REGENERATE") == "true" { + inputs := collectGoldenCorpusInputs(t) + require.NoError(t, os.MkdirAll("testdata", 0o755)) + out, cErr := os.Create(goldenCorpusPath) + require.NoError(t, cErr) + gz := gzip.NewWriter(out) + for _, in := range inputs { + name, path := utils.ConvertComponentIdIntoFriendlyPathSearch(in) + _, _ = gz.Write([]byte(in)) + _, _ = gz.Write([]byte{'\t'}) + _, _ = gz.Write([]byte(name)) + _, _ = gz.Write([]byte{'\t'}) + _, _ = gz.Write([]byte(path)) + _, _ = gz.Write([]byte{'\n'}) + } + require.NoError(t, gz.Close()) + require.NoError(t, out.Close()) + t.Logf("golden corpus regenerated: %d inputs", len(inputs)) + } + + f, err := os.Open(goldenCorpusPath) + require.NoError(t, err, "golden corpus fixture missing - regenerate with GOLDEN_REGENERATE=true") + defer f.Close() + gzr, err := gzip.NewReader(f) + require.NoError(t, err) + defer gzr.Close() + + scanner := bufio.NewScanner(gzr) + scanner.Buffer(make([]byte, 1024*1024), 1024*1024) + lines := 0 + for scanner.Scan() { + lines++ + parts := strings.SplitN(scanner.Text(), "\t", 3) + require.Len(t, parts, 3, fmt.Sprintf("malformed golden corpus line %d", lines)) + name, path := utils.ConvertComponentIdIntoFriendlyPathSearch(parts[0]) + assert.Equal(t, parts[1], name, "name mismatch for input %q (line %d)", parts[0], lines) + assert.Equal(t, parts[2], path, "path mismatch for input %q (line %d)", parts[0], lines) + } + require.NoError(t, scanner.Err()) + assert.Greater(t, lines, 1000, "golden corpus suspiciously small") +} diff --git a/utils/testdata/component_id_golden.txt.gz b/utils/testdata/component_id_golden.txt.gz new file mode 100644 index 00000000..45a45de4 Binary files /dev/null and b/utils/testdata/component_id_golden.txt.gz differ diff --git a/utils/utils.go b/utils/utils.go index a7eb2ee8..f903ce9a 100644 --- a/utils/utils.go +++ b/utils/utils.go @@ -521,10 +521,10 @@ func FindKeyNode(key string, nodes []*yaml.Node) (keyNode *yaml.Node, valueNode } return NodeAlias(v), NodeAlias(nodes[i+1]) // next node is what we need. } - for x, j := range mergedNodeContent(v) { + content := mergedNodeContent(v) + for x, j := range content { if key == j.Value { if IsNodeMap(v) { - content := mergedNodeContent(v) if x+1 == len(content) { return NodeAlias(v), NodeAlias(content[x]) } @@ -532,7 +532,7 @@ func FindKeyNode(key string, nodes []*yaml.Node) (keyNode *yaml.Node, valueNode } if IsNodeArray(v) { - return NodeAlias(v), NodeAlias(mergedNodeContent(v)[x]) + return NodeAlias(v), NodeAlias(content[x]) } } } @@ -960,43 +960,6 @@ func isPathChar(s string) bool { return true } -func appendSegment(sb *strings.Builder, segs []string, cleaned []string, i int, wrapInQuotes bool) { - sb.Reset() - if wrapInQuotes { - sb.WriteString("['") - sb.WriteString(segs[i]) - sb.WriteString("']") - } else { - sb.WriteString("[") - sb.WriteString(segs[i]) - sb.WriteString("]") - } - c := sb.String() - sb.Reset() - sb.WriteString(cleaned[len(cleaned)-1]) - sb.WriteString(c) - cleaned[len(cleaned)-1] = sb.String() -} - -// appendSegmentOptimized uses strings.Builder more efficiently to avoid allocations -func appendSegmentOptimized(segs []string, cleaned []string, i int, wrapInQuotes bool) { - var builder strings.Builder - if wrapInQuotes { - builder.Grow(len(cleaned[len(cleaned)-1]) + len(segs[i]) + 4) // existing + [''] + segment - builder.WriteString(cleaned[len(cleaned)-1]) - builder.WriteString("['") - builder.WriteString(segs[i]) - builder.WriteString("']") - } else { - builder.Grow(len(cleaned[len(cleaned)-1]) + len(segs[i]) + 2) // existing + [] + segment - builder.WriteString(cleaned[len(cleaned)-1]) - builder.WriteByte('[') - builder.WriteString(segs[i]) - builder.WriteByte(']') - } - cleaned[len(cleaned)-1] = builder.String() -} - // parseSmallUint returns the unsigned integer value and true if s is a string of // digits representing a non-negative integer. Returns 0, false otherwise. func parseSmallUint(s string) (int, bool) { @@ -1016,15 +979,16 @@ func parseSmallUint(s string) (int, bool) { // ConvertComponentIdIntoFriendlyPathSearch will convert a JSON Path into a friendly path search string. // the friendliness comes from it being suitable for use with any JSON Path parser. // -// This function was re-written in v0.18.0 in order to fix a number of performance issues with the original -// implementation. Allocations were high and this function is used a lot, this new implementation is much -// lighter on string allocations by using a string builder. +// Rewritten as a single pass over the input with one output builder: no segment slice, +// no per-segment builders. Output is byte-identical to the previous implementation and +// is pinned by the golden corpus in testdata/component_id_golden.txt. func ConvertComponentIdIntoFriendlyPathSearch(id string) (string, string) { if id == "" || id == "#/" { return "", "$." } - segs := strings.Split(id, "/") - lastSeg := segs[len(segs)-1] + + // the name is the raw final segment, JSON-Pointer unescaped and URL decoded. + lastSeg := id[strings.LastIndexByte(id, '/')+1:] if strings.Contains(lastSeg, "~1") { lastSeg = strings.ReplaceAll(lastSeg, "~1", "/") } @@ -1033,151 +997,125 @@ func ConvertComponentIdIntoFriendlyPathSearch(id string) (string, string) { } name := lastSeg - // Pre-allocate with estimated capacity - estimatedCap := len(segs) + (len(segs) / 2) - cleaned := make([]string, 0, estimatedCap) + idContainsHash := strings.Contains(id, "#") + lastIndex := strings.Count(id, "/") // absolute index of the final segment - // check for strange spaces, chars and if found, wrap them up, clean them and create a new cleaned path. - for i := range segs { - if segs[i] == "" { + // note: we do NOT replace # with $ here. the leading # from JSON Pointer notation + // (e.g., "#/components/...") is dropped by the leading-segment rule, and any # + // characters within component names (e.g., "async_search.submit#wait_for_completion_timeout") + // should be preserved literally in the JSONPath query. see issue #485. + var b strings.Builder + b.Grow(len(id) + 16) + b.WriteString("$.") + elements := 0 + + // the plural-parent rule reads the previous segment as transformed by its own + // iteration (bracket-wrapped segments end in ']', stripped segments may shrink), + // so track what the previous segment ended up as, not what it started as. + prevNonEmpty := false + prevEndsInS := false + + pos := 0 + for i := 0; pos <= len(id); i++ { + var seg string + if next := strings.IndexByte(id[pos:], '/'); next >= 0 { + seg = id[pos : pos+next] + pos += next + 1 + } else { + seg = id[pos:] + pos = len(id) + 1 + } + + if seg == "" { + prevNonEmpty = false + prevEndsInS = false continue } - if !isPathChar(segs[i]) { + isLast := i == lastIndex - if strings.Contains(segs[i], "~1") { - segs[i] = strings.ReplaceAll(segs[i], "~1", "/") + if !isPathChar(seg) { + t := seg + if strings.Contains(t, "~1") { + t = strings.ReplaceAll(t, "~1", "/") } - if strings.ContainsRune(segs[i], '%') { - segs[i], _ = url.QueryUnescape(segs[i]) + if strings.ContainsRune(t, '%') { + t, _ = url.QueryUnescape(t) } - - // Use string builder for bracket wrapping - var bracketBuilder strings.Builder - bracketBuilder.Grow(len(segs[i]) + 4) - bracketBuilder.WriteString("['") - bracketBuilder.WriteString(segs[i]) - bracketBuilder.WriteString("']") - segs[i] = bracketBuilder.String() - - if len(cleaned) > 0 && i < len(segs)-1 { - // Use string builder for concatenation with last cleaned element - var concatBuilder strings.Builder - concatBuilder.Grow(len(cleaned[len(cleaned)-1]) + len(segs[i])) - concatBuilder.WriteString(cleaned[len(cleaned)-1]) - concatBuilder.WriteString(segs[i]) - cleaned[len(cleaned)-1] = concatBuilder.String() + // the transformed segment is bracket-wrapped: never empty, ends in ']'. + prevNonEmpty = true + prevEndsInS = false + if i == 0 && !isLast { + // a leading non-path segment (like the '#' of a JSON Pointer) is dropped. continue - } else { - if i > 0 && i < len(segs)-1 { - cleaned = append(cleaned, segs[i]) - continue - } - if i == len(segs)-1 { - l := len(cleaned) - if l > 0 { - // Use string builder for concatenation - var endBuilder strings.Builder - endBuilder.Grow(len(cleaned[l-1]) + len(segs[i])) - endBuilder.WriteString(cleaned[l-1]) - endBuilder.WriteString(segs[i]) - cleaned[l-1] = endBuilder.String() - } else { - cleaned = append(cleaned, segs[i]) - } - } } - } else { + if elements == 0 { + elements++ // becomes the first element, no separator needed + } + b.WriteString("['") + b.WriteString(t) + b.WriteString("']") + continue + } - // strip out any backslashes - if strings.Contains(id, "#") && strings.Contains(segs[i], `\`) { - segs[i] = strings.ReplaceAll(segs[i], `\`, "") - cleaned = append(cleaned, segs[i]) - continue + // strip out any backslashes + if idContainsHash && strings.Contains(seg, `\`) { + t := strings.ReplaceAll(seg, `\`, "") + if elements > 0 { + b.WriteByte('.') } + b.WriteString(t) + elements++ + prevNonEmpty = t != "" + prevEndsInS = t != "" && t[len(t)-1] == 's' + continue + } - intVal, isNum := parseSmallUint(segs[i]) - if isNum { + if intVal, isNum := parseSmallUint(seg); isNum { + // an index with no preceding element is dropped. + if elements > 0 { if intVal <= 99 { - if len(cleaned) > 0 { - appendSegmentOptimized(segs, cleaned, i, false) - } + b.WriteByte('[') + b.WriteString(seg) + b.WriteByte(']') } else { - if len(cleaned) > 0 { - appendSegmentOptimized(segs, cleaned, i, true) - } + b.WriteString("['") + b.WriteString(seg) + b.WriteString("']") } - continue } + prevNonEmpty = true + prevEndsInS = false + continue + } - // if we have a plural parent, wrap it in quotes. - if i > 0 && segs[i-1] != "" && segs[i-1][len(segs[i-1])-1] == 's' { - if i == 2 { // ignore first segment. - cleaned = append(cleaned, segs[i]) - continue + // if we have a plural parent, wrap it in quotes. + if i > 0 && prevNonEmpty && prevEndsInS { + prevNonEmpty = true + prevEndsInS = seg[len(seg)-1] == 's' + if i == 2 { // ignore first segment. + if elements > 0 { + b.WriteByte('.') } - - // Use string builder for plural wrapping - var pluralBuilder strings.Builder - pluralBuilder.Grow(len(cleaned[len(cleaned)-1]) + len(segs[i]) + 4) - pluralBuilder.WriteString(cleaned[len(cleaned)-1]) - pluralBuilder.WriteString("['") - pluralBuilder.WriteString(segs[i]) - pluralBuilder.WriteString("']") - cleaned[len(cleaned)-1] = pluralBuilder.String() + b.WriteString(seg) + elements++ continue } - - cleaned = append(cleaned, segs[i]) - } - } - - // use single string builder for final assembly. - // note: we do NOT replace # with $ here. the leading # from JSON Pointer notation - // (e.g., "#/components/...") is already stripped when we split by "/", and any # - // characters within component names (e.g., "async_search.submit#wait_for_completion_timeout") - // should be preserved literally in the JSONPath query. see issue #485. - var finalBuilder strings.Builder - if len(cleaned) > 1 { - // Estimate final size - totalLen := 0 - for _, seg := range cleaned { - totalLen += len(seg) + b.WriteString("['") + b.WriteString(seg) + b.WriteString("']") + continue } - finalBuilder.Grow(totalLen + len(cleaned) + 5) // segments + dots + $ + potential extra . - finalBuilder.WriteByte('$') - for i, segment := range cleaned { - if i > 0 { - finalBuilder.WriteByte('.') - } - finalBuilder.WriteString(segment) - } - } else { - // Handle single segment case - if len(cleaned) == 1 { - finalBuilder.Grow(len(cleaned[0]) + 5) - finalBuilder.WriteString("$.") - finalBuilder.WriteString(cleaned[0]) - } else { - finalBuilder.WriteString("$.") + if elements > 0 { + b.WriteByte('.') } + b.WriteString(seg) + elements++ + prevNonEmpty = true + prevEndsInS = seg[len(seg)-1] == 's' } - replaced := finalBuilder.String() - - // Ensure proper format - if len(replaced) > 0 { - if len(replaced) > 1 && replaced[1] != '.' { - // Insert period after $ - var dotBuilder strings.Builder - dotBuilder.Grow(len(replaced) + 1) - dotBuilder.WriteByte(replaced[0]) // $ - dotBuilder.WriteByte('.') // . - dotBuilder.WriteString(replaced[1:]) - replaced = dotBuilder.String() - } - } - return name, replaced + return name, b.String() } // ConvertComponentIdIntoPath will convert a JSON Path into a component ID diff --git a/utils/utils_test.go b/utils/utils_test.go index 21627e17..4f42ef24 100644 --- a/utils/utils_test.go +++ b/utils/utils_test.go @@ -1180,23 +1180,6 @@ func TestIsNodeRefValue_False(t *testing.T) { assert.Empty(t, val) } -// Tests for performance optimization coverage -func TestAppendSegment(t *testing.T) { - // Test appendSegment function (currently 0% coverage) - var sb strings.Builder - segs := []string{"test", "segment", "value"} - cleaned := []string{"initial"} - - // Test without quotes - appendSegment(&sb, segs, cleaned, 1, false) - assert.Equal(t, "initial[segment]", cleaned[0]) - - // Test with quotes - cleaned = []string{"another"} - appendSegment(&sb, segs, cleaned, 2, true) - assert.Equal(t, "another['value']", cleaned[0]) -} - func TestConvertComponentIdIntoFriendlyPathSearch_EdgeCases(t *testing.T) { // Test empty cleaned array handling _, path := ConvertComponentIdIntoFriendlyPathSearch("")