Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ require (
golang.org/x/sync v0.19.0
golang.org/x/sys v0.41.0
google.golang.org/grpc v1.80.0
google.golang.org/protobuf v1.36.11
oras.land/oras-go/v2 v2.6.0
)

Expand Down Expand Up @@ -164,7 +165,6 @@ require (
google.golang.org/api v0.214.0 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20260226221140-a57be14db171 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20260226221140-a57be14db171 // indirect
google.golang.org/protobuf v1.36.11 // indirect
gopkg.in/ini.v1 v1.67.0 // indirect
gopkg.in/warnings.v0 v0.1.2 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
Expand Down
4 changes: 4 additions & 0 deletions pkg/modelfile/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,10 @@ var (
"*.ftz", // FastText compressed model
"*.ark", // Kaldi ark format (speech/audio models)
"*.db", // Database files (LMDB, etc.)

// TensorFlow SavedModel literal-name files (no extension).
"feature_map", // TF SavedModel feature map definition
"checkpoint", // TF checkpoint pointer file (literal name)
}

// Code file patterns - supported script and notebook files.
Expand Down
9 changes: 9 additions & 0 deletions pkg/modelfile/constants_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,15 @@ func TestInferFileType(t *testing.T) {
{"at threshold", "borderline", WeightFileSizeThreshold, FileTypeCode},
// Just above threshold should be model
{"above threshold", "borderline", WeightFileSizeThreshold + 1, FileTypeModel},

// TF SavedModel literal-name files: must be MODEL even when 0 bytes,
// independent of the size heuristic that would otherwise classify them as CODE.
{"feature_map literal", "feature_map", 0, FileTypeModel},
{"feature_map small", "feature_map", 1024, FileTypeModel},
{"checkpoint literal small", "checkpoint", 32, FileTypeModel},
// Negative: the literal patterns must not match same-stem-different-extension files.
{"feature_map.json is config", "feature_map.json", 1024, FileTypeConfig},
{"checkpoint.bin is model via *.bin", "checkpoint.bin", 1024, FileTypeModel},
}

assert := assert.New(t)
Expand Down
58 changes: 58 additions & 0 deletions pkg/modelfile/modelfile.go
Original file line number Diff line number Diff line change
Expand Up @@ -346,13 +346,71 @@ func (mf *modelfile) generateByWorkspace(config *configmodelfile.GenerateConfig)
return err
}

// ONNX external_data post-processing: any tensor file referenced by an .onnx
// file via external_data.location is unconditionally a model weight file,
// regardless of its name or size. Walker may have classified small external
// tensor files as code/config/doc by extension/size heuristic; reclassify them.
mf.reclassifyONNXExternalData()

if mf.model.Size() == 0 && mf.code.Size() == 0 && mf.dataset.Size() == 0 {
return fmt.Errorf("no model/code/dataset found - you have to create the Modelfile by yourself")
}

return nil
}

// reclassifyONNXExternalData scans every .onnx file already in mf.model,
// extracts external_data.location paths, and moves those paths from whichever
// bucket the walker placed them in into mf.model.
//
// To avoid bypassing the walker's filtering (ExcludePatterns, isSkippable, file
// count / size limits, workspace boundary), this function ONLY reclassifies
// paths that are already present in one of the existing hashsets (config /
// code / doc / model). Paths that the walker excluded — including paths
// outside the workspace produced by a malformed `../` location — are silently
// ignored. ONNX parse failures degrade gracefully: a WARNING is printed and
// the affected .onnx's external tensors keep whatever classification the
// walker assigned (the pre-fix behavior).
func (mf *modelfile) reclassifyONNXExternalData() {
walkerCollected := func(rel string) bool {
return mf.model.Contains(rel) || mf.code.Contains(rel) ||
mf.config.Contains(rel) || mf.doc.Contains(rel)
}

for _, raw := range mf.model.Values() {
modelRel, ok := raw.(string)
if !ok || !strings.HasSuffix(strings.ToLower(modelRel), ".onnx") {
continue
}
onnxAbs := filepath.Join(mf.workspace, modelRel)
extPaths, err := ExtractONNXExternalDataPaths(onnxAbs)
if err != nil {
fmt.Fprintf(os.Stderr,
"WARNING: modelfile: failed to parse ONNX external_data from %s: %v "+
"-- external tensor files (if any) will keep walker-assigned classification\n",
modelRel, err)
continue
}
onnxDir := filepath.Dir(modelRel)
for _, ext := range extPaths {
relExt := filepath.Clean(filepath.Join(onnxDir, ext))
// Walker membership check absorbs all of:
// - exclude pattern (walker dropped it -> not in any bucket)
// - skippable directories (.git, etc.)
// - file count / size limits (walker errored before adding)
// - workspace boundary (walker never sees ../outside paths)
// - file simply doesn't exist on disk
if !walkerCollected(relExt) {
continue
}
mf.code.Remove(relExt)
mf.config.Remove(relExt)
mf.doc.Remove(relExt)
mf.model.Add(relExt)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Preserve include/exclude filters for ONNX reclassification

The ONNX post-pass unconditionally does mf.model.Add(relExt) for every referenced tensor path, but this runs after the walk/filter phase and does not reapply GenerateConfig include/exclude rules. As a result, files that users explicitly excluded can be reintroduced into MODEL and then included in builds/uploads, which violates CLI filtering intent. Reclassification should only move paths that were already admitted by the workspace filter logic.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed in 660ae9b. The walker-membership check is exactly the proposed fix: a path can only be reclassified if it survived the walker's filter pass — which already enforces ExcludePatterns, isSkippable, file count / size limits, and workspace boundary. So an --exclude'd tensor can never be re-added by the ONNX post-pass.

}
}
}

// generateByModelConfig generates the modelfile by the model config, such as config.json and generation_config.json.
func (mf *modelfile) generateByModelConfig() error {
// Get config map from json files. Collect all the keys and values from the config files
Expand Down
254 changes: 254 additions & 0 deletions pkg/modelfile/modelfile_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -811,6 +811,111 @@ func TestNewModelfileByWorkspace(t *testing.T) {
expectCodes: []string{},
expectName: "selective-include",
},
{
// Mirrors screenshot 1 (TF SavedModel directory) plus the screenshots 4/5
// /home/admin/<ts>/{checkpoint,variables,...} layout: variables/ shards
// and a sibling checkpoint/ directory with model.ckpt-NNN.{data-*,index,meta}.
// Pre-fix, feature_map and the literal `checkpoint` files would have been
// classified as CODE because they have no extension and are small.
name: "tensorflow saved_model directory",
setupFiles: map[string]string{
"saved_model.pb": "",
"feature_map": "",
"tf_signature.txt": "",
"alps.meta": "",
"ais-sync.done": "",
"variables/checkpoint": "",
"variables/variables.index": "",
"variables/variables.data-00000-of-00002": "",
"variables/variables.data-00001-of-00002": "",
"checkpoint/checkpoint": "",
"checkpoint/model.ckpt-756921.index": "",
"checkpoint/model.ckpt-756921.meta": "",
"checkpoint/model.ckpt-756921.data-00000-of-00002": "",
"checkpoint/model.ckpt-756921.data-00001-of-00002": "",
},
config: &configmodelfile.GenerateConfig{
Name: "tf-savedmodel",
},
expectError: false,
// *.meta hits ConfigFilePatterns first (alps.meta, model.ckpt-*.meta).
expectConfigs: []string{
"alps.meta",
"checkpoint/model.ckpt-756921.meta",
},
expectModels: []string{
"saved_model.pb",
"feature_map",
"variables/checkpoint",
"variables/variables.index",
"variables/variables.data-00000-of-00002",
"variables/variables.data-00001-of-00002",
"checkpoint/checkpoint",
"checkpoint/model.ckpt-756921.index",
"checkpoint/model.ckpt-756921.data-00000-of-00002",
"checkpoint/model.ckpt-756921.data-00001-of-00002",
},
// ais-sync.done has no extension and is empty -> falls back to CODE by
// the size heuristic. Documented intentional behavior (user accepted).
expectCodes: []string{"ais-sync.done"},
expectDocs: []string{"tf_signature.txt"},
expectName: "tf-savedmodel",
},
{
// Mirrors screenshot 3/4/5: a parent directory containing base/<ts>/
// and delta/<ts>/ subdirs, each holding a complete TF SavedModel +
// checkpoint layout. Verifies the walker recurses correctly and that
// relative paths in the resulting Modelfile preserve the timestamp
// directory prefix so pull-side reconstruction restores the full tree.
name: "online-learning base plus delta tree",
setupFiles: map[string]string{
"base/20251202030708/saved_model.pb": "",
"base/20251202030708/feature_map": "",
"base/20251202030708/tf_signature.txt": "",
"base/20251202030708/alps.meta": "",
"base/20251202030708/ais-sync.done": "",
"base/20251202030708/variables/checkpoint": "",
"base/20251202030708/variables/variables.index": "",
"base/20251202030708/variables/variables.data-00000-of-00002": "",
"delta/20251202060205/saved_model.pb": "",
"delta/20251202060205/feature_map": "",
"delta/20251202060205/tf_signature.txt": "",
"delta/20251202060205/alps.meta": "",
"delta/20251202060205/ais-sync.done": "",
"delta/20251202060205/variables/checkpoint": "",
"delta/20251202060205/variables/variables.index": "",
"delta/20251202060205/variables/variables.data-00000-of-00002": "",
},
config: &configmodelfile.GenerateConfig{
Name: "online-learning",
},
expectError: false,
expectConfigs: []string{
"base/20251202030708/alps.meta",
"delta/20251202060205/alps.meta",
},
expectModels: []string{
"base/20251202030708/saved_model.pb",
"base/20251202030708/feature_map",
"base/20251202030708/variables/checkpoint",
"base/20251202030708/variables/variables.index",
"base/20251202030708/variables/variables.data-00000-of-00002",
"delta/20251202060205/saved_model.pb",
"delta/20251202060205/feature_map",
"delta/20251202060205/variables/checkpoint",
"delta/20251202060205/variables/variables.index",
"delta/20251202060205/variables/variables.data-00000-of-00002",
},
expectCodes: []string{
"base/20251202030708/ais-sync.done",
"delta/20251202060205/ais-sync.done",
},
expectDocs: []string{
"base/20251202030708/tf_signature.txt",
"delta/20251202060205/tf_signature.txt",
},
expectName: "online-learning",
},
}

assert := assert.New(t)
Expand Down Expand Up @@ -2125,3 +2230,152 @@ func min(a, b int64) int64 {
}
return b
}

// TestNewModelfileByWorkspace_ONNXExternalData mirrors screenshot 2: an ONNX
// directory where model.onnx references many extension-less tensor files via
// external_data. Pre-fix, small external tensor files were classified as CODE
// by the size heuristic; the ONNX post-processor must reclassify them all as
// MODEL deterministically — regardless of the file's size.
func TestNewModelfileByWorkspace_ONNXExternalData(t *testing.T) {
// Subset of the 36 extension-less files seen in screenshot 2. Names taken
// verbatim from the actual workflow_10062365 directory.
externalLocations := []string{
"tower_deep_layer_0_kernel_read__448_1",
"tower_shallow_layer_0_kernel_read__440_0",
"moe_layer_layer_0_kernel__399_15",
"moe_layer_layer_0_domain_bn_ExpandDims__400_16",
"moe_layer_search_layer_0_kernel__413_3",
"moe_layer_non_search_layer_0_kernel__427_9",
"feature_gate_main_kernel_read__352_27",
"feature_gate_main_domain_bn_ExpandDims__354_28",
"gated_dcn_layer_0_transform_kernel__385_21",
"conversion_layer_0_kernel_read__364_35",
"main_domain_embedding_Mark_output_user_main_domain_kernel_read__270_20",
"external_data_for_resource_handle",
}

tempDir, err := os.MkdirTemp("", "modelfile-onnx-*")
require.NoError(t, err)
defer os.RemoveAll(tempDir)

// Write the ONNX file with external_data references encoded into protobuf bytes.
require.NoError(t, os.WriteFile(
filepath.Join(tempDir, "model.onnx"),
buildMinimalONNXBytes(externalLocations),
0o644,
))

// Create each external tensor file as an empty placeholder.
for _, name := range externalLocations {
require.NoError(t, os.WriteFile(filepath.Join(tempDir, name), nil, 0o644))
}

// Sibling files from the screenshot.
require.NoError(t, os.WriteFile(filepath.Join(tempDir, "tf_signature.txt"), nil, 0o644))
require.NoError(t, os.WriteFile(filepath.Join(tempDir, "variable_sequence_def_0.onnx"), buildMinimalONNXBytes(nil), 0o644))

mf, err := NewModelfileByWorkspace(tempDir, &configmodelfile.GenerateConfig{
Name: "onnx-external-data",
Workspace: tempDir,
})
require.NoError(t, err)

models := mf.GetModels()
codes := mf.GetCodes()
configs := mf.GetConfigs()

// Every external_data location must appear in MODEL, regardless of file size.
for _, name := range externalLocations {
assert.Contains(t, models, name, "external_data tensor %q must be reclassified as MODEL", name)
assert.NotContains(t, codes, name, "external_data tensor %q must NOT be in CODE", name)
assert.NotContains(t, configs, name, "external_data tensor %q must NOT be in CONFIG", name)
}
// Both .onnx files themselves are MODELs by extension.
assert.Contains(t, models, "model.onnx")
assert.Contains(t, models, "variable_sequence_def_0.onnx")
// tf_signature.txt is a doc per default DocFilePatterns (*.txt).
assert.Contains(t, mf.GetDocs(), "tf_signature.txt")
}

// P1 coverage: an external_data.location of "../escape" must NOT promote
// anything outside the workspace. The walker never collected such a path, so
// it must be silently dropped from reclassification.
func TestNewModelfileByWorkspace_ONNXExternalDataPathTraversalIgnored(t *testing.T) {
tempDir := t.TempDir()

require.NoError(t, os.WriteFile(
filepath.Join(tempDir, "model.onnx"),
buildMinimalONNXBytes([]string{"../escape", "weights.bin"}),
0o644,
))
// weights.bin is a real, walker-collected file — should reclassify normally.
require.NoError(t, os.WriteFile(filepath.Join(tempDir, "weights.bin"), nil, 0o644))

mf, err := NewModelfileByWorkspace(tempDir, &configmodelfile.GenerateConfig{
Name: "onnx-traversal",
Workspace: tempDir,
})
require.NoError(t, err)

models := mf.GetModels()
assert.Contains(t, models, "weights.bin", "in-workspace external tensor must be reclassified")
for _, m := range models {
assert.NotContains(t, m, "..", "no model entry should escape the workspace")
assert.NotEqual(t, "../escape", m)
assert.NotEqual(t, "escape", m)
}
}

// P1 coverage: ExcludePatterns must not be silently overridden by ONNX
// reclassification. If the walker excluded a tensor file, it must remain
// excluded — even if model.onnx references it via external_data.
func TestNewModelfileByWorkspace_ONNXExternalDataRespectsExclude(t *testing.T) {
tempDir := t.TempDir()

require.NoError(t, os.WriteFile(
filepath.Join(tempDir, "model.onnx"),
buildMinimalONNXBytes([]string{"weights_keep.bin", "weights_drop.bin"}),
0o644,
))
require.NoError(t, os.WriteFile(filepath.Join(tempDir, "weights_keep.bin"), nil, 0o644))
require.NoError(t, os.WriteFile(filepath.Join(tempDir, "weights_drop.bin"), nil, 0o644))

mf, err := NewModelfileByWorkspace(tempDir, &configmodelfile.GenerateConfig{
Name: "onnx-exclude",
Workspace: tempDir,
ExcludePatterns: []string{"weights_drop.bin"},
})
require.NoError(t, err)

models := mf.GetModels()
assert.Contains(t, models, "weights_keep.bin")
assert.NotContains(t, models, "weights_drop.bin",
"excluded path must not be re-added by ONNX reclassification")
// And it must not have leaked into any other bucket either.
all := append(append(append([]string{}, mf.GetCodes()...), mf.GetConfigs()...), mf.GetDocs()...)
assert.NotContains(t, all, "weights_drop.bin")
}

// P3 coverage: a corrupted .onnx file must not crash generate. The .onnx
// itself stays in MODEL (it matches *.onnx), and any sibling tensor files
// keep whatever classification the walker already gave them — i.e., the
// pre-fix behavior. A WARNING is emitted to stderr (not asserted here, but
// captured via the parser's error path).
func TestNewModelfileByWorkspace_ONNXParseFailureFallsBack(t *testing.T) {
tempDir := t.TempDir()

// Corrupted bytes that look like ONNX but aren't valid wire format.
require.NoError(t, os.WriteFile(filepath.Join(tempDir, "model.onnx"), []byte{0xff, 0xff, 0xff, 0xff}, 0o644))
// A small extension-less file the walker would classify as CODE — stays CODE on parse failure.
require.NoError(t, os.WriteFile(filepath.Join(tempDir, "auxiliary_blob"), nil, 0o644))

mf, err := NewModelfileByWorkspace(tempDir, &configmodelfile.GenerateConfig{
Name: "onnx-corrupt",
Workspace: tempDir,
})
require.NoError(t, err, "corrupted ONNX must not abort generate")
assert.Contains(t, mf.GetModels(), "model.onnx", ".onnx file itself stays in MODEL by extension")
// auxiliary_blob keeps walker classification (CODE for small extension-less);
// we don't assert exact bucket — only that generate succeeded and produced output.
assert.NotEmpty(t, mf.GetModels())
}
Loading
Loading