From ba513148b804714243c3099eee6c2f4b31b9b6e7 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Wed, 9 Apr 2025 08:45:58 -0700 Subject: [PATCH 01/64] testing spec --- python/ctranslate2/converters/transformers.py | 6 +++++- python/ctranslate2/specs/attention_spec.py | 3 ++- python/ctranslate2/specs/common_spec.py | 12 ++++++++++++ python/ctranslate2/specs/transformer_spec.py | 10 ++++++---- python/ctranslate2/specs/whisper_spec.py | 7 ++++--- 5 files changed, 29 insertions(+), 9 deletions(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 2684dd2c7..06fa057d5 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -902,12 +902,13 @@ class WhisperLoader(BartLoader): def architecture_name(self): return "WhisperForConditionalGeneration" - def get_model_spec(self, model): + def get_model_spec(self, model, low_rank=False): spec = whisper_spec.WhisperSpec( model.config.encoder_layers, model.config.encoder_attention_heads, model.config.decoder_layers, model.config.decoder_attention_heads, + low_rank=low_rank, ) self.set_encoder(spec.encoder, model.model.encoder) @@ -996,6 +997,9 @@ def set_conv1d(self, spec, module): spec.weight = module.weight spec.bias = module.bias +class LiteWhisperLoader(WhisperLoader): + def get_model_spec(self, model): + return super().get_model_spec(model, low_rank=True) @register_loader("Wav2Vec2Config") class Wav2Vec2Loader(BartLoader): diff --git a/python/ctranslate2/specs/attention_spec.py b/python/ctranslate2/specs/attention_spec.py index f49d41121..cd44802d4 100644 --- a/python/ctranslate2/specs/attention_spec.py +++ b/python/ctranslate2/specs/attention_spec.py @@ -32,12 +32,13 @@ def __init__( num_heads_kv=None, head_dim=None, sliding_window=None, + low_rank=False, ): self.queries_scale = model_spec.OPTIONAL self.layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm) self.linear = [ - common_spec.LinearSpec() for _ in range(2 if self_attention else 3) + common_spec.LinearSpec() if not low_rank else common_spec.LowRankLinearSpec() for _ in range(2 if self_attention else 3) ] if relative_position: diff --git a/python/ctranslate2/specs/common_spec.py b/python/ctranslate2/specs/common_spec.py index 598a452d6..4e048ce2d 100644 --- a/python/ctranslate2/specs/common_spec.py +++ b/python/ctranslate2/specs/common_spec.py @@ -51,6 +51,18 @@ def __init__(self): def has_bias(self): return not isinstance(self.bias, str) +class LowRankLinearSpec(model_spec.LayerSpec): + def __init__(self): + super().__init__() + self.weight1 = None + self.weight2 = None + self.weight_scale = model_spec.OPTIONAL + self.weight_zero = model_spec.OPTIONAL + self.bias = model_spec.OPTIONAL + + def has_bias(self): + return not isinstance(self.bias, str) + class Conv1DSpec(model_spec.LayerSpec): def __init__(self): diff --git a/python/ctranslate2/specs/transformer_spec.py b/python/ctranslate2/specs/transformer_spec.py index 230e62cfd..4be7e9466 100644 --- a/python/ctranslate2/specs/transformer_spec.py +++ b/python/ctranslate2/specs/transformer_spec.py @@ -253,6 +253,7 @@ def __init__( rms_norm=False, num_heads_kv=None, sliding_window=None, + low_rank=False ): self.self_attention = attention_spec.MultiHeadAttentionSpec( self_attention=True, @@ -261,8 +262,9 @@ def __init__( rms_norm=rms_norm, num_heads_kv=num_heads_kv, sliding_window=sliding_window, + low_rank=low_rank, ) - self.ffn = FeedForwardSpec(glu=ffn_glu, rms_norm=rms_norm) + self.ffn = FeedForwardSpec(glu=ffn_glu, rms_norm=rms_norm, low_rank=low_rank) class TransformerDecoderLayerSpec(model_spec.LayerSpec): @@ -340,10 +342,10 @@ def __init__( class FeedForwardSpec(model_spec.LayerSpec): - def __init__(self, glu=False, rms_norm=False): + def __init__(self, glu=False, rms_norm=False, low_rank=False): self.layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm) - self.linear_0 = common_spec.LinearSpec() - self.linear_1 = common_spec.LinearSpec() + self.linear_0 = common_spec.LinearSpec() if not low_rank else common_spec.LowRankLinearSpec() + self.linear_1 = common_spec.LinearSpec() if not low_rank else common_spec.LowRankLinearSpec() if glu: self.linear_0_noact = common_spec.LinearSpec() diff --git a/python/ctranslate2/specs/whisper_spec.py b/python/ctranslate2/specs/whisper_spec.py index e32453e1c..a04909d3e 100644 --- a/python/ctranslate2/specs/whisper_spec.py +++ b/python/ctranslate2/specs/whisper_spec.py @@ -32,6 +32,7 @@ def __init__( num_encoder_heads, num_decoder_layers, num_decoder_heads, + low_rank=False, ): """Initializes the model specification. @@ -42,7 +43,7 @@ def __init__( num_decoder_heads: The number of decoder attention heads. """ super().__init__() - self.encoder = WhisperEncoderSpec(num_encoder_layers, num_encoder_heads) + self.encoder = WhisperEncoderSpec(num_encoder_layers, num_encoder_heads, low_rank) self.decoder = transformer_spec.TransformerDecoderSpec( num_decoder_layers, num_decoder_heads, @@ -66,12 +67,12 @@ def get_vocabulary_size(self): class WhisperEncoderSpec(model_spec.LayerSpec): - def __init__(self, num_layers, num_heads): + def __init__(self, num_layers, num_heads, low_rank=False): self.num_heads = np.dtype("int16").type(num_heads) self.conv1 = common_spec.Conv1DSpec() self.conv2 = common_spec.Conv1DSpec() self.position_encodings = transformer_spec.PositionEncoderSpec() self.layer_norm = common_spec.LayerNormSpec() self.layer = [ - transformer_spec.TransformerEncoderLayerSpec() for _ in range(num_layers) + transformer_spec.TransformerEncoderLayerSpec(low_rank=low_rank) for _ in range(num_layers) ] From 63cb8ff8a5d0a061d85d4333fce5be4554225a87 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Wed, 9 Apr 2025 22:12:43 -0700 Subject: [PATCH 02/64] testing spec --- python/ctranslate2/converters/transformers.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 06fa057d5..527c57735 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -997,7 +997,12 @@ def set_conv1d(self, spec, module): spec.weight = module.weight spec.bias = module.bias +@register_loader("LiteWhisperConfig") class LiteWhisperLoader(WhisperLoader): + @property + def architecture_name(self): + return "LiteWhisperForConditionalGeneration" + def get_model_spec(self, model): return super().get_model_spec(model, low_rank=True) From 24fe394363d7c5939ba771e49e49d932b629c430 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Wed, 9 Apr 2025 22:48:52 -0700 Subject: [PATCH 03/64] added test --- python/tests/test_transformers.py | 37 +++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index 1fed8196d..1aad2a836 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -832,6 +832,43 @@ def _get_features(audio): transcription = processor.decode(token_ids) assert transcription == expected_transcription + # @test_utils.only_on_linux + @test_utils.on_available_devices + def test_transformers_lite_whisper(tmpdir, device): + import transformers + + model_name = "efficient-speech/lite-whisper-tiny" + converter = transformers.converters.TransformersConverter(model_name, trust_remote_code=True) + output_dir = converter.convert('./ctranslate2_model') + + audio_path = os.path.join(test_utils.get_data_dir(), "audio", "mr_quilter.npy") + audio = np.load(audio_path) + + processor = transformers.WhisperProcessor.from_pretrained(model_name) + inputs = processor(audio, return_tensors="np", sampling_rate=16000) + features = ctranslate2.StorageView.from_array(inputs.input_features) + + model = ctranslate2.models.WhisperModel.from_path(output_dir, device=device) + + results = model.detect_language(features) + best_lang, best_prob = results[0][0] + assert best_lang == "<|en|>" + assert best_prob > 0.9 + + prompt = processor.get_decoder_prompt_ids(language="en", task="transcribe") + prompt = [token for _, token in prompt] + + results = model.generate(features, [prompt]) + + transcription = processor.decode( + results[0].sequences_ids[0], skip_special_tokens=True + ) + assert transcription == ( + " Mr. Quilter is the apostle of the middle classes " + "and we are glad to welcome his gospel." + ) + + @test_utils.only_on_linux @test_utils.on_available_devices @pytest.mark.parametrize( From d2585776d49035356ed019c9d0aff180ae3bf9ad Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Wed, 9 Apr 2025 23:14:42 -0700 Subject: [PATCH 04/64] update --- python/tests/test_transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index 1aad2a836..0a47c5967 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -832,7 +832,7 @@ def _get_features(audio): transcription = processor.decode(token_ids) assert transcription == expected_transcription - # @test_utils.only_on_linux + @test_utils.only_on_linux @test_utils.on_available_devices def test_transformers_lite_whisper(tmpdir, device): import transformers From 607a649332f194bf18c81d46d815695cd62ad018 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Wed, 9 Apr 2025 23:20:07 -0700 Subject: [PATCH 05/64] fix --- python/tests/test_transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index 0a47c5967..82235c4d9 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -838,7 +838,7 @@ def test_transformers_lite_whisper(tmpdir, device): import transformers model_name = "efficient-speech/lite-whisper-tiny" - converter = transformers.converters.TransformersConverter(model_name, trust_remote_code=True) + converter = ctranslate2.converters.TransformersConverter(model_name, trust_remote_code=True) output_dir = converter.convert('./ctranslate2_model') audio_path = os.path.join(test_utils.get_data_dir(), "audio", "mr_quilter.npy") From cf840ad6da0e65796ac86cd83c98b4bfb3970782 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Fri, 11 Apr 2025 08:02:32 -0700 Subject: [PATCH 06/64] loading fixes --- python/ctranslate2/converters/transformers.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 527c57735..1dbd30ebc 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -96,6 +96,7 @@ def __init__( trust_remote_code: Allow converting models using custom code. """ self._model_name_or_path = model_name_or_path + self._model_processor_name = (model_name_or_path if not model_name_or_path.startswith('efficient-speech/lite-whisper') else 'openai/whisper-large-v3') self._activation_scales = activation_scales self._copy_files = copy_files self._load_as_float16 = load_as_float16 @@ -119,9 +120,6 @@ def _load(self): % (config_name, ", ".join(sorted(_MODEL_LOADERS.keys()))) ) - model_class = getattr(transformers, loader.architecture_name) - tokenizer_class = transformers.AutoTokenizer - kwargs = { "torch_dtype": ( torch.float16 @@ -137,14 +135,19 @@ def _load(self): if self._trust_remote_code: kwargs["trust_remote_code"] = self._trust_remote_code - model = self.load_model(model_class, self._model_name_or_path, **kwargs) + if hasattr(transformers, loader.architecture_name): + model_class = getattr(transformers, loader.architecture_name) + model = self.load_model(model_class, self._model_name_or_path, **kwargs) + else: + model = transformers.AutoModel.from_pretrained(self._model_name_or_path, **kwargs) tokenizer_kwargs = {} if self._trust_remote_code: tokenizer_kwargs["trust_remote_code"] = self._trust_remote_code + tokenizer_class = transformers.AutoTokenizer tokenizer = self.load_tokenizer( - tokenizer_class, self._model_name_or_path, **tokenizer_kwargs + tokenizer_class, self._model_processor_name, **tokenizer_kwargs ) spec = loader(model, tokenizer) From 0412e6febd9caf43dfb25912d32c6093376c4816 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Mon, 14 Apr 2025 00:06:57 -0700 Subject: [PATCH 07/64] fix --- python/ctranslate2/converters/transformers.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 1dbd30ebc..4f1aa99ec 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -240,6 +240,19 @@ def set_linear(self, spec, module, quant_type=common_spec.Quantization.CT2): spec.weight = spec.weight.transpose(0, 1) if module.bias is not None: spec.bias = module.bias + + def set_low_rank_linear(self, spec, module, quant_type=common_spec.Quantization.CT2): + if quant_type == common_spec.Quantization.CT2: + spec.weight1 = module.weight1 + spec.weight2 = module.weight2 + else: + spec.weight1 = module.qweight1 + spec.weight2 = module.qweight2 + spec.weight_scale = module.scales + spec.weight_zero = module.qzeros + + if module.bias is not None: + spec.bias = module.bias def set_embeddings(self, spec, module): spec.weight = module.weight From 7dbd2d840aee15512a228fc22e0e4b164c3b3c8a Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Mon, 14 Apr 2025 08:40:44 -0700 Subject: [PATCH 08/64] fixed --- python/ctranslate2/converters/transformers.py | 48 +++++++++++++------ python/ctranslate2/converters/utils.py | 34 +++++++++++++ 2 files changed, 68 insertions(+), 14 deletions(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 4f1aa99ec..a34609167 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -312,7 +312,7 @@ def set_config(self, config, model, tokenizer): model.config.decoder_start_token_id ) - def set_encoder(self, spec, encoder): + def set_encoder(self, spec, encoder, low_rank=False): self.set_common_layers(spec, encoder) for layer_spec, layer in zip(spec.layer, encoder.layers): @@ -320,14 +320,19 @@ def set_encoder(self, spec, encoder): layer_spec.self_attention, layer.self_attn, self_attention=True, + low_rank=low_rank, ) self.set_layer_norm( layer_spec.self_attention.layer_norm, layer.self_attn_layer_norm, ) - self.set_linear(layer_spec.ffn.linear_0, layer.fc1) - self.set_linear(layer_spec.ffn.linear_1, layer.fc2) + if low_rank: + self.set_low_rank_linear(layer_spec.ffn.linear_0, layer.fc1) + self.set_low_rank_linear(layer_spec.ffn.linear_1, layer.fc2) + else: + self.set_linear(layer_spec.ffn.linear_0, layer.fc1) + self.set_linear(layer_spec.ffn.linear_1, layer.fc2) self.set_layer_norm(layer_spec.ffn.layer_norm, layer.final_layer_norm) def set_decoder(self, spec, decoder): @@ -359,17 +364,32 @@ def set_decoder(self, spec, decoder): self.set_linear(layer_spec.ffn.linear_1, layer.fc2) self.set_layer_norm(layer_spec.ffn.layer_norm, layer.final_layer_norm) - def set_attention(self, spec, attention, self_attention=False): - split_layers = [common_spec.LinearSpec() for _ in range(3)] - self.set_linear(split_layers[0], attention.q_proj) - self.set_linear(split_layers[1], attention.k_proj) - self.set_linear(split_layers[2], attention.v_proj) + def set_attention(self, spec, attention, self_attention=False, low_rank=False): + split_layers = [ + (common_spec.LowRankLinearSpec() if low_rank else common_spec.LinearSpec()) + for _ in range(3) + ] + if low_rank: + self.set_low_rank_linear(split_layers[0], attention.q_proj) + self.set_low_rank_linear(split_layers[1], attention.k_proj) + self.set_low_rank_linear(split_layers[2], attention.v_proj) + else: + self.set_linear(split_layers[0], attention.q_proj) + self.set_linear(split_layers[1], attention.k_proj) + self.set_linear(split_layers[2], attention.v_proj) if self_attention: - utils.fuse_linear(spec.linear[0], split_layers) + if low__rank: + utils.fuse_low_rank_linear(spec.linear[0], split_layers) + else: + utils.fuse_linear(spec.linear[0], split_layers) else: - utils.fuse_linear(spec.linear[0], split_layers[:1]) - utils.fuse_linear(spec.linear[1], split_layers[1:]) + if low_rank: + utils.fuse_low_rank_linear(spec.linear[0], split_layers[:1]) + utils.fuse_low_rank_linear(spec.linear[1], split_layers[1:]) + else: + utils.fuse_linear(spec.linear[0], split_layers[:1]) + utils.fuse_linear(spec.linear[1], split_layers[1:]) self.set_linear(spec.linear[-1], attention.out_proj) @@ -927,7 +947,7 @@ def get_model_spec(self, model, low_rank=False): low_rank=low_rank, ) - self.set_encoder(spec.encoder, model.model.encoder) + self.set_encoder(spec.encoder, model.model.encoder, low_rank=low_rank) self.set_decoder(spec.decoder, model.model.decoder) self.set_linear(spec.decoder.projection, model.proj_out) @@ -996,10 +1016,10 @@ def get_vocabulary(self, model, tokenizer): def set_vocabulary(self, spec, tokens): spec.register_vocabulary(tokens) - def set_encoder(self, spec, encoder): + def set_encoder(self, spec, encoder, low_rank=False): self.set_conv1d(spec.conv1, encoder.conv1) self.set_conv1d(spec.conv2, encoder.conv2) - super().set_encoder(spec, encoder) + super().set_encoder(spec, encoder, low_rank=low_rank) def set_decoder(self, spec, decoder): self.set_embeddings(spec.embeddings, decoder.embed_tokens) diff --git a/python/ctranslate2/converters/utils.py b/python/ctranslate2/converters/utils.py index c744ff925..2262ec308 100644 --- a/python/ctranslate2/converters/utils.py +++ b/python/ctranslate2/converters/utils.py @@ -35,6 +35,40 @@ def fuse_linear(spec, layers): ) +def fuse_low_rank_linear(spec, layers): + if not layers: + raise ValueError("Cannot fuse low rank linear layers: at least one layer is required") + + if isinstance(layers[0].weight1, np.ndarray): + concatenate = np.concatenate + zeros = np.zeros + else: + import torch + + concatenate = torch.cat + zeros = torch.zeros + + spec.weight1 = concatenate([layer.weight1 for layer in layers]) + spec.weight2 = concatenate([layer.weight2 for layer in layers]) + + bias_dtype = None + for layer in layers: + if layer.has_bias(): + bias_dtype = layer.bias.dtype + break + + if bias_dtype is not None: + spec.bias = concatenate( + [ + ( + layer.bias + if layer.has_bias() + else zeros([layer.weight1.shape[0]], dtype=bias_dtype) + ) + for layer in layers + ] + ) + def fuse_linear_prequant(spec, layers, axis): if not layers: raise ValueError("Cannot fuse linear layers: at least one layer is required") From a5ac7cdfa2bd29d0af10d86bc046a57f7dc7cf59 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Mon, 14 Apr 2025 08:51:55 -0700 Subject: [PATCH 09/64] fixed --- python/ctranslate2/converters/transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index a34609167..a48bc091c 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -379,7 +379,7 @@ def set_attention(self, spec, attention, self_attention=False, low_rank=False): self.set_linear(split_layers[2], attention.v_proj) if self_attention: - if low__rank: + if low_rank: utils.fuse_low_rank_linear(spec.linear[0], split_layers) else: utils.fuse_linear(spec.linear[0], split_layers) From a667e1ebc1af0c067f6522b8a5e01baf6daf8033 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Mon, 14 Apr 2025 17:51:44 -0700 Subject: [PATCH 10/64] test --- python/ctranslate2/converters/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ctranslate2/converters/utils.py b/python/ctranslate2/converters/utils.py index 2262ec308..197d95a22 100644 --- a/python/ctranslate2/converters/utils.py +++ b/python/ctranslate2/converters/utils.py @@ -47,7 +47,7 @@ def fuse_low_rank_linear(spec, layers): concatenate = torch.cat zeros = torch.zeros - + print(layers) spec.weight1 = concatenate([layer.weight1 for layer in layers]) spec.weight2 = concatenate([layer.weight2 for layer in layers]) From 9220303da5bd7cfbead54001cc5e45960ec1c361 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Mon, 14 Apr 2025 17:53:13 -0700 Subject: [PATCH 11/64] test --- python/ctranslate2/converters/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ctranslate2/converters/utils.py b/python/ctranslate2/converters/utils.py index 197d95a22..31a2613e2 100644 --- a/python/ctranslate2/converters/utils.py +++ b/python/ctranslate2/converters/utils.py @@ -47,7 +47,7 @@ def fuse_low_rank_linear(spec, layers): concatenate = torch.cat zeros = torch.zeros - print(layers) + print('layer values!!: ', layers) spec.weight1 = concatenate([layer.weight1 for layer in layers]) spec.weight2 = concatenate([layer.weight2 for layer in layers]) From 80e6b7ba89b1c9b414bf2d592dac006d3083f653 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Mon, 14 Apr 2025 17:55:26 -0700 Subject: [PATCH 12/64] test --- python/ctranslate2/converters/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ctranslate2/converters/utils.py b/python/ctranslate2/converters/utils.py index 31a2613e2..f7d14d1d3 100644 --- a/python/ctranslate2/converters/utils.py +++ b/python/ctranslate2/converters/utils.py @@ -47,7 +47,7 @@ def fuse_low_rank_linear(spec, layers): concatenate = torch.cat zeros = torch.zeros - print('layer values!!: ', layers) + raise ValueError(f"Layers!!: {layers}") spec.weight1 = concatenate([layer.weight1 for layer in layers]) spec.weight2 = concatenate([layer.weight2 for layer in layers]) From e501642a5774b0bcc6b23bbd82719c233c22ef68 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Mon, 14 Apr 2025 17:56:50 -0700 Subject: [PATCH 13/64] test --- python/ctranslate2/converters/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ctranslate2/converters/utils.py b/python/ctranslate2/converters/utils.py index f7d14d1d3..0d2d1109f 100644 --- a/python/ctranslate2/converters/utils.py +++ b/python/ctranslate2/converters/utils.py @@ -47,7 +47,7 @@ def fuse_low_rank_linear(spec, layers): concatenate = torch.cat zeros = torch.zeros - raise ValueError(f"Layers!!: {layers}") + raise ValueError(f"Layers!!: {[len(layer.weight1) for layer in layers]}") spec.weight1 = concatenate([layer.weight1 for layer in layers]) spec.weight2 = concatenate([layer.weight2 for layer in layers]) From e17f23f18ffa43782015bcda0763ad6478b872f8 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Mon, 14 Apr 2025 17:58:05 -0700 Subject: [PATCH 14/64] test --- python/ctranslate2/converters/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ctranslate2/converters/utils.py b/python/ctranslate2/converters/utils.py index 0d2d1109f..ff91d4126 100644 --- a/python/ctranslate2/converters/utils.py +++ b/python/ctranslate2/converters/utils.py @@ -47,7 +47,7 @@ def fuse_low_rank_linear(spec, layers): concatenate = torch.cat zeros = torch.zeros - raise ValueError(f"Layers!!: {[len(layer.weight1) for layer in layers]}") + print(f"Layers!!: {[len(layer.weight1) for layer in layers]}") spec.weight1 = concatenate([layer.weight1 for layer in layers]) spec.weight2 = concatenate([layer.weight2 for layer in layers]) From eb830921db7e5c400b02c867c6ddcbc5b9f76f9a Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Mon, 14 Apr 2025 23:45:03 -0700 Subject: [PATCH 15/64] test --- python/ctranslate2/converters/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ctranslate2/converters/utils.py b/python/ctranslate2/converters/utils.py index ff91d4126..bf821aeb3 100644 --- a/python/ctranslate2/converters/utils.py +++ b/python/ctranslate2/converters/utils.py @@ -47,7 +47,7 @@ def fuse_low_rank_linear(spec, layers): concatenate = torch.cat zeros = torch.zeros - print(f"Layers!!: {[len(layer.weight1) for layer in layers]}") + print(f"Layers!!: {[layer.weight1.shape for layer in layers]}") spec.weight1 = concatenate([layer.weight1 for layer in layers]) spec.weight2 = concatenate([layer.weight2 for layer in layers]) From f786c6d5bcc8fddc3bed3e05e074958d4c281bde Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Thu, 17 Apr 2025 00:57:43 -0700 Subject: [PATCH 16/64] test_axis_1 --- .vscode/settings.json | 5 +++++ python/ctranslate2/converters/utils.py | 6 +++--- 2 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 000000000..a3b2b51f1 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,5 @@ +{ + "files.associations": { + "vector": "cpp" + } +} \ No newline at end of file diff --git a/python/ctranslate2/converters/utils.py b/python/ctranslate2/converters/utils.py index bf821aeb3..17513ad4a 100644 --- a/python/ctranslate2/converters/utils.py +++ b/python/ctranslate2/converters/utils.py @@ -47,9 +47,9 @@ def fuse_low_rank_linear(spec, layers): concatenate = torch.cat zeros = torch.zeros - print(f"Layers!!: {[layer.weight1.shape for layer in layers]}") - spec.weight1 = concatenate([layer.weight1 for layer in layers]) - spec.weight2 = concatenate([layer.weight2 for layer in layers]) + + spec.weight1 = concatenate([layer.weight1 for layer in layers], axis=1) + spec.weight2 = concatenate([layer.weight2 for layer in layers], axis=1) bias_dtype = None for layer in layers: From e7cfdad5cab5a3b0c075751adbd611e36956d067 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Thu, 17 Apr 2025 01:00:05 -0700 Subject: [PATCH 17/64] test_axis_1 --- python/ctranslate2/converters/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ctranslate2/converters/utils.py b/python/ctranslate2/converters/utils.py index 17513ad4a..3a2c8bdaf 100644 --- a/python/ctranslate2/converters/utils.py +++ b/python/ctranslate2/converters/utils.py @@ -47,7 +47,7 @@ def fuse_low_rank_linear(spec, layers): concatenate = torch.cat zeros = torch.zeros - + print(f"fuse_low_rank_linear: {[layer.weight1.shape for layer in layers]}") spec.weight1 = concatenate([layer.weight1 for layer in layers], axis=1) spec.weight2 = concatenate([layer.weight2 for layer in layers], axis=1) From a6a4c4217c28c25a408f7d3abaf003fd79aee3cd Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Thu, 17 Apr 2025 01:01:29 -0700 Subject: [PATCH 18/64] test_axis_1 --- python/ctranslate2/converters/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/ctranslate2/converters/utils.py b/python/ctranslate2/converters/utils.py index 3a2c8bdaf..b4958f4d1 100644 --- a/python/ctranslate2/converters/utils.py +++ b/python/ctranslate2/converters/utils.py @@ -47,7 +47,8 @@ def fuse_low_rank_linear(spec, layers): concatenate = torch.cat zeros = torch.zeros - print(f"fuse_low_rank_linear: {[layer.weight1.shape for layer in layers]}") + print(f"fuse_low_rank_linear 1: {[layer.weight1.shape for layer in layers]}") + print(f"fuse_low_rank_linear 2: {[layer.weight1.shape for layer in layers]}") spec.weight1 = concatenate([layer.weight1 for layer in layers], axis=1) spec.weight2 = concatenate([layer.weight2 for layer in layers], axis=1) From ff9a3af33ef319e12eae5c66cf5ec1947317db2e Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Thu, 17 Apr 2025 01:02:22 -0700 Subject: [PATCH 19/64] test_axis_1 --- python/ctranslate2/converters/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ctranslate2/converters/utils.py b/python/ctranslate2/converters/utils.py index b4958f4d1..0b679156e 100644 --- a/python/ctranslate2/converters/utils.py +++ b/python/ctranslate2/converters/utils.py @@ -48,7 +48,7 @@ def fuse_low_rank_linear(spec, layers): concatenate = torch.cat zeros = torch.zeros print(f"fuse_low_rank_linear 1: {[layer.weight1.shape for layer in layers]}") - print(f"fuse_low_rank_linear 2: {[layer.weight1.shape for layer in layers]}") + print(f"fuse_low_rank_linear 2: {[layer.weight2.shape for layer in layers]}") spec.weight1 = concatenate([layer.weight1 for layer in layers], axis=1) spec.weight2 = concatenate([layer.weight2 for layer in layers], axis=1) From b8d0c7ab3e18a6908f512d8cabeb901130f2924c Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Thu, 17 Apr 2025 01:35:07 -0700 Subject: [PATCH 20/64] test_axis_1 --- python/ctranslate2/converters/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ctranslate2/converters/utils.py b/python/ctranslate2/converters/utils.py index 0b679156e..ec56e98ca 100644 --- a/python/ctranslate2/converters/utils.py +++ b/python/ctranslate2/converters/utils.py @@ -50,7 +50,7 @@ def fuse_low_rank_linear(spec, layers): print(f"fuse_low_rank_linear 1: {[layer.weight1.shape for layer in layers]}") print(f"fuse_low_rank_linear 2: {[layer.weight2.shape for layer in layers]}") spec.weight1 = concatenate([layer.weight1 for layer in layers], axis=1) - spec.weight2 = concatenate([layer.weight2 for layer in layers], axis=1) + spec.weight2 = concatenate([layer.weight2 for layer in layers]) bias_dtype = None for layer in layers: From 566e35bc1bed98380e052bb0f29513e4b71506d1 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Thu, 17 Apr 2025 07:42:39 -0700 Subject: [PATCH 21/64] test_axis_1 --- python/ctranslate2/converters/transformers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index a48bc091c..5d4bc7af6 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -391,7 +391,10 @@ def set_attention(self, spec, attention, self_attention=False, low_rank=False): utils.fuse_linear(spec.linear[0], split_layers[:1]) utils.fuse_linear(spec.linear[1], split_layers[1:]) - self.set_linear(spec.linear[-1], attention.out_proj) + if low_rank: + self.set_low_rank_linear(spec.linear[-1], attention.out_proj) + else: + self.set_linear(spec.linear[-1], attention.out_proj) def set_common_layers(self, spec, module): import math From 5230ee4157fd941a7bda591cd2fce8fda28d7e45 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Thu, 17 Apr 2025 08:04:29 -0700 Subject: [PATCH 22/64] test_axis_1 --- python/ctranslate2/converters/transformers.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 5d4bc7af6..0bb81c1ae 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -327,12 +327,16 @@ def set_encoder(self, spec, encoder, low_rank=False): layer.self_attn_layer_norm, ) - if low_rank: + if hasattr(layer.fc1, 'weight1'): self.set_low_rank_linear(layer_spec.ffn.linear_0, layer.fc1) - self.set_low_rank_linear(layer_spec.ffn.linear_1, layer.fc2) else: self.set_linear(layer_spec.ffn.linear_0, layer.fc1) + + if hasattr(layer.fc2, 'weight1'): + self.set_low_rank_linear(layer_spec.ffn.linear_0, layer.fc1) + else: self.set_linear(layer_spec.ffn.linear_1, layer.fc2) + self.set_layer_norm(layer_spec.ffn.layer_norm, layer.final_layer_norm) def set_decoder(self, spec, decoder): From 220ae0716689ff0081fb6ff701cabc6bb4476ce7 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Thu, 17 Apr 2025 08:14:24 -0700 Subject: [PATCH 23/64] test_axis_1 --- python/ctranslate2/converters/utils.py | 3 +-- python/tests/test_transformers.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/ctranslate2/converters/utils.py b/python/ctranslate2/converters/utils.py index ec56e98ca..8b30ff6a3 100644 --- a/python/ctranslate2/converters/utils.py +++ b/python/ctranslate2/converters/utils.py @@ -47,8 +47,7 @@ def fuse_low_rank_linear(spec, layers): concatenate = torch.cat zeros = torch.zeros - print(f"fuse_low_rank_linear 1: {[layer.weight1.shape for layer in layers]}") - print(f"fuse_low_rank_linear 2: {[layer.weight2.shape for layer in layers]}") + spec.weight1 = concatenate([layer.weight1 for layer in layers], axis=1) spec.weight2 = concatenate([layer.weight2 for layer in layers]) diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index 82235c4d9..bfd2a2895 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -763,7 +763,7 @@ def test_transformers_whisper( expected_no_speech_probs, ): import transformers - + print('device type here idiot:', str(device)) converter = ctranslate2.converters.TransformersConverter(model_name) output_dir = str(tmp_dir.join("ctranslate2_model")) output_dir = converter.convert(output_dir) From 5259c69565965c99fddec2f9d47b135b8e3f7f2a Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Thu, 17 Apr 2025 08:18:54 -0700 Subject: [PATCH 24/64] test_axis_1 --- python/tests/test_transformers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index bfd2a2895..2546814f9 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -763,6 +763,9 @@ def test_transformers_whisper( expected_no_speech_probs, ): import transformers + import torch + device = 'cuda' if torch.cuda.is_available() else 'cpu' + print('device type here idiot:', str(device)) converter = ctranslate2.converters.TransformersConverter(model_name) output_dir = str(tmp_dir.join("ctranslate2_model")) From c016c41c0f622bcd1bd07feafdb4257b7a9730ef Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Thu, 17 Apr 2025 08:20:21 -0700 Subject: [PATCH 25/64] test_axis_1 --- python/tests/test_transformers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index 2546814f9..3f8a3e526 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -764,9 +764,9 @@ def test_transformers_whisper( ): import transformers import torch + import sys device = 'cuda' if torch.cuda.is_available() else 'cpu' - - print('device type here idiot:', str(device)) + print(f'platform: ' {sys.platform}') converter = ctranslate2.converters.TransformersConverter(model_name) output_dir = str(tmp_dir.join("ctranslate2_model")) output_dir = converter.convert(output_dir) From bfd33862e7fd7df9d66d95d5f206e9aa199d30f1 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Thu, 17 Apr 2025 08:20:26 -0700 Subject: [PATCH 26/64] test_axis_1 --- python/tests/test_transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index 3f8a3e526..78d819edc 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -766,7 +766,7 @@ def test_transformers_whisper( import torch import sys device = 'cuda' if torch.cuda.is_available() else 'cpu' - print(f'platform: ' {sys.platform}') + print(f'platform: {sys.platform}') converter = ctranslate2.converters.TransformersConverter(model_name) output_dir = str(tmp_dir.join("ctranslate2_model")) output_dir = converter.convert(output_dir) From 58d2b5a2af0195e9eedda2d316c0a0570ab4a2ab Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Sun, 20 Apr 2025 20:16:28 -0700 Subject: [PATCH 27/64] logging --- python/ctranslate2/converters/transformers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 0bb81c1ae..aa9341e01 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -327,6 +327,9 @@ def set_encoder(self, spec, encoder, low_rank=False): layer.self_attn_layer_norm, ) + print(f"layer_spec.ffn.linear_0 has weight1: {hasattr(layer_spec.ffn.linear_0, 'weight1')}") + print(f"layer.fc1 has weight1: {hasattr(layer.fc1, 'weight1')}") + if hasattr(layer.fc1, 'weight1'): self.set_low_rank_linear(layer_spec.ffn.linear_0, layer.fc1) else: From 2b0fbf11445c9363222654b6695671d0d20b6762 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Sun, 20 Apr 2025 20:19:53 -0700 Subject: [PATCH 28/64] logging --- python/ctranslate2/converters/transformers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index aa9341e01..90ce426f7 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -329,6 +329,8 @@ def set_encoder(self, spec, encoder, low_rank=False): print(f"layer_spec.ffn.linear_0 has weight1: {hasattr(layer_spec.ffn.linear_0, 'weight1')}") print(f"layer.fc1 has weight1: {hasattr(layer.fc1, 'weight1')}") + print(f"layer_spec.ffn.linear_0 has weight2: {hasattr(layer_spec.ffn.linear_0, 'weight2')}") + print(f"layer.fc1 has weight2: {hasattr(layer.fc1, 'weight2')}") if hasattr(layer.fc1, 'weight1'): self.set_low_rank_linear(layer_spec.ffn.linear_0, layer.fc1) From 4aa77d8adb253b27525a35b59da4ee63e58b9e79 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Sun, 20 Apr 2025 20:20:53 -0700 Subject: [PATCH 29/64] logging --- python/ctranslate2/converters/transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 90ce426f7..cdb4176a4 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -337,7 +337,7 @@ def set_encoder(self, spec, encoder, low_rank=False): else: self.set_linear(layer_spec.ffn.linear_0, layer.fc1) - if hasattr(layer.fc2, 'weight1'): + if hasattr(layer.fc2, 'weight2'): self.set_low_rank_linear(layer_spec.ffn.linear_0, layer.fc1) else: self.set_linear(layer_spec.ffn.linear_1, layer.fc2) From 84f54d425ac7eb193910bfc90cf9bcd80823d2c7 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Sun, 20 Apr 2025 20:21:54 -0700 Subject: [PATCH 30/64] logging --- python/ctranslate2/converters/transformers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index cdb4176a4..52ce34329 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -329,15 +329,15 @@ def set_encoder(self, spec, encoder, low_rank=False): print(f"layer_spec.ffn.linear_0 has weight1: {hasattr(layer_spec.ffn.linear_0, 'weight1')}") print(f"layer.fc1 has weight1: {hasattr(layer.fc1, 'weight1')}") - print(f"layer_spec.ffn.linear_0 has weight2: {hasattr(layer_spec.ffn.linear_0, 'weight2')}") - print(f"layer.fc1 has weight2: {hasattr(layer.fc1, 'weight2')}") + print(f"layer_spec.ffn.linear_1 has weight1: {hasattr(layer_spec.ffn.linear_1, 'weight2')}") + print(f"layer.fc2 has weight1: {hasattr(layer.fc2, 'weight2')}") if hasattr(layer.fc1, 'weight1'): self.set_low_rank_linear(layer_spec.ffn.linear_0, layer.fc1) else: self.set_linear(layer_spec.ffn.linear_0, layer.fc1) - if hasattr(layer.fc2, 'weight2'): + if hasattr(layer.fc2, 'weight1'): self.set_low_rank_linear(layer_spec.ffn.linear_0, layer.fc1) else: self.set_linear(layer_spec.ffn.linear_1, layer.fc2) From e08a06e03155a85ae004c1ccc12c09afe97f0330 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Sun, 20 Apr 2025 20:23:40 -0700 Subject: [PATCH 31/64] logging --- python/ctranslate2/converters/transformers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 52ce34329..0f0dfd888 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -1052,6 +1052,9 @@ def architecture_name(self): return "LiteWhisperForConditionalGeneration" def get_model_spec(self, model): + print(f"Model: {model}") + print(f"Model class: {model.__class__.__name__}") + print(f"Model config: {model.config}") return super().get_model_spec(model, low_rank=True) @register_loader("Wav2Vec2Config") From ab31afcaf048ccd92ec6f101d50bdfcbe65600e6 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Sun, 20 Apr 2025 21:07:32 -0700 Subject: [PATCH 32/64] logging --- python/ctranslate2/converters/transformers.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 0f0dfd888..2ad5208ed 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -329,8 +329,9 @@ def set_encoder(self, spec, encoder, low_rank=False): print(f"layer_spec.ffn.linear_0 has weight1: {hasattr(layer_spec.ffn.linear_0, 'weight1')}") print(f"layer.fc1 has weight1: {hasattr(layer.fc1, 'weight1')}") - print(f"layer_spec.ffn.linear_1 has weight1: {hasattr(layer_spec.ffn.linear_1, 'weight2')}") - print(f"layer.fc2 has weight1: {hasattr(layer.fc2, 'weight2')}") + print(f"layer_spec.ffn.linear_1 has weight1: {hasattr(layer_spec.ffn.linear_1, 'weight1')}") + print(f"layer.fc2 has weight1: {hasattr(layer.fc2, 'weight1')}") + print(f"layer.fc2 has weight: {hasattr(layer.fc2, 'weight')}") if hasattr(layer.fc1, 'weight1'): self.set_low_rank_linear(layer_spec.ffn.linear_0, layer.fc1) From d6b924b668f72c77af4e145b926bb63cefae7571 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Sun, 20 Apr 2025 21:09:19 -0700 Subject: [PATCH 33/64] logging --- python/ctranslate2/converters/transformers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 2ad5208ed..05634c0c8 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -229,6 +229,9 @@ def set_layer_norm(self, spec, module): spec.beta = module.bias def set_linear(self, spec, module, quant_type=common_spec.Quantization.CT2): + print("set linear") + print(f"module: {module}") + print(f'spec: {spec}') if quant_type == common_spec.Quantization.CT2: spec.weight = module.weight else: @@ -1053,9 +1056,6 @@ def architecture_name(self): return "LiteWhisperForConditionalGeneration" def get_model_spec(self, model): - print(f"Model: {model}") - print(f"Model class: {model.__class__.__name__}") - print(f"Model config: {model.config}") return super().get_model_spec(model, low_rank=True) @register_loader("Wav2Vec2Config") From 58c7547981e47f3287573714a907019053268437 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Sun, 20 Apr 2025 21:12:45 -0700 Subject: [PATCH 34/64] logging --- python/ctranslate2/converters/transformers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 05634c0c8..5702f3c5d 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -232,6 +232,9 @@ def set_linear(self, spec, module, quant_type=common_spec.Quantization.CT2): print("set linear") print(f"module: {module}") print(f'spec: {spec}') + print(module.weight) + spec.weight = 5 + print(spec.weight) if quant_type == common_spec.Quantization.CT2: spec.weight = module.weight else: From 94e6b34e15f4dc90ab10bbef0cabb4293a3e7129 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Sun, 20 Apr 2025 21:16:40 -0700 Subject: [PATCH 35/64] logging --- python/ctranslate2/converters/transformers.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 5702f3c5d..2d49270e6 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -229,12 +229,6 @@ def set_layer_norm(self, spec, module): spec.beta = module.bias def set_linear(self, spec, module, quant_type=common_spec.Quantization.CT2): - print("set linear") - print(f"module: {module}") - print(f'spec: {spec}') - print(module.weight) - spec.weight = 5 - print(spec.weight) if quant_type == common_spec.Quantization.CT2: spec.weight = module.weight else: @@ -333,20 +327,18 @@ def set_encoder(self, spec, encoder, low_rank=False): layer.self_attn_layer_norm, ) - print(f"layer_spec.ffn.linear_0 has weight1: {hasattr(layer_spec.ffn.linear_0, 'weight1')}") - print(f"layer.fc1 has weight1: {hasattr(layer.fc1, 'weight1')}") - print(f"layer_spec.ffn.linear_1 has weight1: {hasattr(layer_spec.ffn.linear_1, 'weight1')}") - print(f"layer.fc2 has weight1: {hasattr(layer.fc2, 'weight1')}") - print(f"layer.fc2 has weight: {hasattr(layer.fc2, 'weight')}") - if hasattr(layer.fc1, 'weight1'): + layer_spec.ffn.linear_0 = common_spec.LowRankLinearSpec() self.set_low_rank_linear(layer_spec.ffn.linear_0, layer.fc1) else: + layer_spec.ffn.linear_0 = common_spec.LinearSpec() self.set_linear(layer_spec.ffn.linear_0, layer.fc1) if hasattr(layer.fc2, 'weight1'): + layer_spec.ffn.linear_1 = common_spec.LowRankLinearSpec() self.set_low_rank_linear(layer_spec.ffn.linear_0, layer.fc1) else: + layer_spec.ffn.linear_1 = common_spec.LinearSpec() self.set_linear(layer_spec.ffn.linear_1, layer.fc2) self.set_layer_norm(layer_spec.ffn.layer_norm, layer.final_layer_norm) From 97aeeeb742f853435f21eb24547d2efc051d972a Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Sun, 20 Apr 2025 21:40:05 -0700 Subject: [PATCH 36/64] logging --- python/ctranslate2/converters/transformers.py | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 2d49270e6..c4de0c6e0 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -328,14 +328,12 @@ def set_encoder(self, spec, encoder, low_rank=False): ) if hasattr(layer.fc1, 'weight1'): - layer_spec.ffn.linear_0 = common_spec.LowRankLinearSpec() self.set_low_rank_linear(layer_spec.ffn.linear_0, layer.fc1) else: layer_spec.ffn.linear_0 = common_spec.LinearSpec() self.set_linear(layer_spec.ffn.linear_0, layer.fc1) if hasattr(layer.fc2, 'weight1'): - layer_spec.ffn.linear_1 = common_spec.LowRankLinearSpec() self.set_low_rank_linear(layer_spec.ffn.linear_0, layer.fc1) else: layer_spec.ffn.linear_1 = common_spec.LinearSpec() @@ -374,16 +372,24 @@ def set_decoder(self, spec, decoder): def set_attention(self, spec, attention, self_attention=False, low_rank=False): split_layers = [ - (common_spec.LowRankLinearSpec() if low_rank else common_spec.LinearSpec()) - for _ in range(3) + common_spec.LowRankLinearSpec() if hasattr(attention.q_proj, 'weight1') else common_spec.LinearSpec(), + common_spec.LowRankLinearSpec() if hasattr(attention.k_proj, 'weight2') else common_spec.LinearSpec(), + common_spec.LowRankLinearSpec() if hasattr(attention.v_proj, 'weight3') else common_spec.LinearSpec(), ] - if low_rank: + + if hasattr(split_layers[0], "weight1"): self.set_low_rank_linear(split_layers[0], attention.q_proj) - self.set_low_rank_linear(split_layers[1], attention.k_proj) - self.set_low_rank_linear(split_layers[2], attention.v_proj) else: self.set_linear(split_layers[0], attention.q_proj) + + if hasattr(split_layers[1], "weight1"): + self.set_low_rank_linear(split_layers[1], attention.k_proj) + else: self.set_linear(split_layers[1], attention.k_proj) + + if hasattr(split_layers[2], "weight1"): + self.set_low_rank_linear(split_layers[2], attention.v_proj) + else: self.set_linear(split_layers[2], attention.v_proj) if self_attention: From b113821519cbdb75eb387701027033d5ed3a74ea Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Sun, 20 Apr 2025 21:41:05 -0700 Subject: [PATCH 37/64] logging --- python/ctranslate2/converters/transformers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index c4de0c6e0..2f68271bd 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -373,8 +373,8 @@ def set_decoder(self, spec, decoder): def set_attention(self, spec, attention, self_attention=False, low_rank=False): split_layers = [ common_spec.LowRankLinearSpec() if hasattr(attention.q_proj, 'weight1') else common_spec.LinearSpec(), - common_spec.LowRankLinearSpec() if hasattr(attention.k_proj, 'weight2') else common_spec.LinearSpec(), - common_spec.LowRankLinearSpec() if hasattr(attention.v_proj, 'weight3') else common_spec.LinearSpec(), + common_spec.LowRankLinearSpec() if hasattr(attention.k_proj, 'weight1') else common_spec.LinearSpec(), + common_spec.LowRankLinearSpec() if hasattr(attention.v_proj, 'weight1') else common_spec.LinearSpec(), ] if hasattr(split_layers[0], "weight1"): From 77837b9ddb0192938b768d2672aadcd28f646d27 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Sun, 20 Apr 2025 21:48:04 -0700 Subject: [PATCH 38/64] logging --- python/ctranslate2/converters/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/ctranslate2/converters/utils.py b/python/ctranslate2/converters/utils.py index 8b30ff6a3..cf9b23c18 100644 --- a/python/ctranslate2/converters/utils.py +++ b/python/ctranslate2/converters/utils.py @@ -48,8 +48,9 @@ def fuse_low_rank_linear(spec, layers): concatenate = torch.cat zeros = torch.zeros - spec.weight1 = concatenate([layer.weight1 for layer in layers], axis=1) - spec.weight2 = concatenate([layer.weight2 for layer in layers]) + # TODO(eyoel-gebre): maybe don't concat fallback linear to low_rank? + spec.weight1 = concatenate([layer.weight if not hasattr(layer, 'weight1') else layer.weight1 for layer in layers], axis=1) + spec.weight2 = concatenate([layer.weight if not hasattr(layer, 'weight2') else layer.weight2 for layer in layers]) bias_dtype = None for layer in layers: From 98e8941562c2525546ff7f9ad1c66438bcc3b012 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Sun, 20 Apr 2025 22:08:16 -0700 Subject: [PATCH 39/64] logging --- python/ctranslate2/converters/transformers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 2f68271bd..eed329aa9 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -405,9 +405,10 @@ def set_attention(self, spec, attention, self_attention=False, low_rank=False): utils.fuse_linear(spec.linear[0], split_layers[:1]) utils.fuse_linear(spec.linear[1], split_layers[1:]) - if low_rank: + if hasattr(attention.out_proj, "weight1"): self.set_low_rank_linear(spec.linear[-1], attention.out_proj) else: + self.linear[-1] = common_spec.LinearSpec() self.set_linear(spec.linear[-1], attention.out_proj) def set_common_layers(self, spec, module): From b6c8edc8a455c07b464b9a73a378e6793e4eec76 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Sun, 20 Apr 2025 22:08:50 -0700 Subject: [PATCH 40/64] logging --- python/ctranslate2/converters/transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index eed329aa9..38361b5c1 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -408,7 +408,7 @@ def set_attention(self, spec, attention, self_attention=False, low_rank=False): if hasattr(attention.out_proj, "weight1"): self.set_low_rank_linear(spec.linear[-1], attention.out_proj) else: - self.linear[-1] = common_spec.LinearSpec() + spec.linear[-1] = common_spec.LinearSpec() self.set_linear(spec.linear[-1], attention.out_proj) def set_common_layers(self, spec, module): From c1c5c47d976cb5f9eacd14d124b6833f0510e5c8 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Sun, 20 Apr 2025 22:10:15 -0700 Subject: [PATCH 41/64] spec --- python/ctranslate2/converters/transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 38361b5c1..3fa5ea42a 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -334,7 +334,7 @@ def set_encoder(self, spec, encoder, low_rank=False): self.set_linear(layer_spec.ffn.linear_0, layer.fc1) if hasattr(layer.fc2, 'weight1'): - self.set_low_rank_linear(layer_spec.ffn.linear_0, layer.fc1) + self.set_low_rank_linear(layer_spec.ffn.linear_1, layer.fc2) else: layer_spec.ffn.linear_1 = common_spec.LinearSpec() self.set_linear(layer_spec.ffn.linear_1, layer.fc2) From 99ab9637361630cada4b65d75e9e6c368dada053 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Tue, 22 Apr 2025 06:51:39 -0700 Subject: [PATCH 42/64] remove-test --- python/ctranslate2/converters/transformers.py | 8 +--- python/tests/faster-whisper-test.py | 10 +++++ python/tests/test_transformers.py | 37 ------------------- 3 files changed, 12 insertions(+), 43 deletions(-) create mode 100644 python/tests/faster-whisper-test.py diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 3fa5ea42a..9da66e315 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -398,12 +398,8 @@ def set_attention(self, spec, attention, self_attention=False, low_rank=False): else: utils.fuse_linear(spec.linear[0], split_layers) else: - if low_rank: - utils.fuse_low_rank_linear(spec.linear[0], split_layers[:1]) - utils.fuse_low_rank_linear(spec.linear[1], split_layers[1:]) - else: - utils.fuse_linear(spec.linear[0], split_layers[:1]) - utils.fuse_linear(spec.linear[1], split_layers[1:]) + utils.fuse_linear(spec.linear[0], split_layers[:1]) + utils.fuse_linear(spec.linear[1], split_layers[1:]) if hasattr(attention.out_proj, "weight1"): self.set_low_rank_linear(spec.linear[-1], attention.out_proj) diff --git a/python/tests/faster-whisper-test.py b/python/tests/faster-whisper-test.py new file mode 100644 index 000000000..da1fccec7 --- /dev/null +++ b/python/tests/faster-whisper-test.py @@ -0,0 +1,10 @@ +from faster_whisper import WhisperModel + +model = WhisperModel("testou") + +segments, info = model.transcribe("harvard.wav", beam_size=5) + +print("Detected language '%s' with probability %f" % (info.language, info.language_probability)) + +for segment in segments: + print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text)) \ No newline at end of file diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index 78d819edc..51e862090 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -835,43 +835,6 @@ def _get_features(audio): transcription = processor.decode(token_ids) assert transcription == expected_transcription - @test_utils.only_on_linux - @test_utils.on_available_devices - def test_transformers_lite_whisper(tmpdir, device): - import transformers - - model_name = "efficient-speech/lite-whisper-tiny" - converter = ctranslate2.converters.TransformersConverter(model_name, trust_remote_code=True) - output_dir = converter.convert('./ctranslate2_model') - - audio_path = os.path.join(test_utils.get_data_dir(), "audio", "mr_quilter.npy") - audio = np.load(audio_path) - - processor = transformers.WhisperProcessor.from_pretrained(model_name) - inputs = processor(audio, return_tensors="np", sampling_rate=16000) - features = ctranslate2.StorageView.from_array(inputs.input_features) - - model = ctranslate2.models.WhisperModel.from_path(output_dir, device=device) - - results = model.detect_language(features) - best_lang, best_prob = results[0][0] - assert best_lang == "<|en|>" - assert best_prob > 0.9 - - prompt = processor.get_decoder_prompt_ids(language="en", task="transcribe") - prompt = [token for _, token in prompt] - - results = model.generate(features, [prompt]) - - transcription = processor.decode( - results[0].sequences_ids[0], skip_special_tokens=True - ) - assert transcription == ( - " Mr. Quilter is the apostle of the middle classes " - "and we are glad to welcome his gospel." - ) - - @test_utils.only_on_linux @test_utils.on_available_devices @pytest.mark.parametrize( From 367b6c6b52ae5d149e7920751afc202f543d35a6 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Tue, 22 Apr 2025 06:54:25 -0700 Subject: [PATCH 43/64] remove-garbage --- .vscode/settings.json | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index a3b2b51f1..000000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "files.associations": { - "vector": "cpp" - } -} \ No newline at end of file From d7dc1074a4535313c992a233920672d3e65c41bc Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Tue, 22 Apr 2025 07:01:36 -0700 Subject: [PATCH 44/64] remove-garbage --- python/tests/faster-whisper-test.py | 10 ---------- python/tests/test_transformers.py | 5 +---- 2 files changed, 1 insertion(+), 14 deletions(-) delete mode 100644 python/tests/faster-whisper-test.py diff --git a/python/tests/faster-whisper-test.py b/python/tests/faster-whisper-test.py deleted file mode 100644 index da1fccec7..000000000 --- a/python/tests/faster-whisper-test.py +++ /dev/null @@ -1,10 +0,0 @@ -from faster_whisper import WhisperModel - -model = WhisperModel("testou") - -segments, info = model.transcribe("harvard.wav", beam_size=5) - -print("Detected language '%s' with probability %f" % (info.language, info.language_probability)) - -for segment in segments: - print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text)) \ No newline at end of file diff --git a/python/tests/test_transformers.py b/python/tests/test_transformers.py index 51e862090..1fed8196d 100644 --- a/python/tests/test_transformers.py +++ b/python/tests/test_transformers.py @@ -763,10 +763,7 @@ def test_transformers_whisper( expected_no_speech_probs, ): import transformers - import torch - import sys - device = 'cuda' if torch.cuda.is_available() else 'cpu' - print(f'platform: {sys.platform}') + converter = ctranslate2.converters.TransformersConverter(model_name) output_dir = str(tmp_dir.join("ctranslate2_model")) output_dir = converter.convert(output_dir) From 93e4a70c9e739a6d18acb8f94e1465ef613ca8ef Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Wed, 30 Apr 2025 08:12:46 -0700 Subject: [PATCH 45/64] updated model executor for lite-whisper --- .gitignore | 2 + include/ctranslate2/layers/attention_layer.h | 1 + include/ctranslate2/layers/common.h | 2 + python/ctranslate2/converters/transformers.py | 136 +++++++++++------- python/ctranslate2/converters/utils.py | 35 ----- python/ctranslate2/specs/attention_spec.py | 9 +- python/ctranslate2/specs/common_spec.py | 4 +- src/layers/attention.cc | 25 +++- src/layers/attention_layer.cc | 24 +++- src/layers/common.cc | 24 +++- 10 files changed, 159 insertions(+), 103 deletions(-) diff --git a/.gitignore b/.gitignore index 9c6801c43..4348bc598 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ python/dist/ .cache docs/build/ docs/python/ + +.vscode diff --git a/include/ctranslate2/layers/attention_layer.h b/include/ctranslate2/layers/attention_layer.h index e55ecc5de..941a04913 100644 --- a/include/ctranslate2/layers/attention_layer.h +++ b/include/ctranslate2/layers/attention_layer.h @@ -52,6 +52,7 @@ namespace ctranslate2 { const bool multi_query = false); protected: + bool _is_low_rank; const bool _tensor_parallel; const dim_t _num_heads; const bool _self_attention; diff --git a/include/ctranslate2/layers/common.h b/include/ctranslate2/layers/common.h index 137b926d3..b6d8a29bd 100644 --- a/include/ctranslate2/layers/common.h +++ b/include/ctranslate2/layers/common.h @@ -135,7 +135,9 @@ namespace ctranslate2 { void select_weights(const StorageView* index, const StorageView* extra_bias = nullptr); private: bool _packed_weight; + bool _is_low_rank; const StorageView& _weight; + const StorageView* _weight2; const StorageView* _bias; const StorageView* _qscale; const StorageView* _qzero; diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 9da66e315..69b0ecc66 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -243,11 +243,11 @@ def set_linear(self, spec, module, quant_type=common_spec.Quantization.CT2): def set_low_rank_linear(self, spec, module, quant_type=common_spec.Quantization.CT2): if quant_type == common_spec.Quantization.CT2: - spec.weight1 = module.weight1 - spec.weight2 = module.weight2 + spec.low_rank_weight1 = module.weight1 + spec.low_rank_weight2 = module.weight2 else: - spec.weight1 = module.qweight1 - spec.weight2 = module.qweight2 + spec.low_rank_weight1 = module.qweight1 + spec.low_rank_weight2 = module.qweight2 spec.weight_scale = module.scales spec.weight_zero = module.qzeros @@ -312,7 +312,7 @@ def set_config(self, config, model, tokenizer): model.config.decoder_start_token_id ) - def set_encoder(self, spec, encoder, low_rank=False): + def set_encoder(self, spec, encoder): self.set_common_layers(spec, encoder) for layer_spec, layer in zip(spec.layer, encoder.layers): @@ -320,25 +320,14 @@ def set_encoder(self, spec, encoder, low_rank=False): layer_spec.self_attention, layer.self_attn, self_attention=True, - low_rank=low_rank, ) self.set_layer_norm( layer_spec.self_attention.layer_norm, layer.self_attn_layer_norm, ) - if hasattr(layer.fc1, 'weight1'): - self.set_low_rank_linear(layer_spec.ffn.linear_0, layer.fc1) - else: - layer_spec.ffn.linear_0 = common_spec.LinearSpec() - self.set_linear(layer_spec.ffn.linear_0, layer.fc1) - - if hasattr(layer.fc2, 'weight1'): - self.set_low_rank_linear(layer_spec.ffn.linear_1, layer.fc2) - else: - layer_spec.ffn.linear_1 = common_spec.LinearSpec() - self.set_linear(layer_spec.ffn.linear_1, layer.fc2) - + self.set_linear(layer_spec.ffn.linear_0, layer.fc1) + self.set_linear(layer_spec.ffn.linear_1, layer.fc2) self.set_layer_norm(layer_spec.ffn.layer_norm, layer.final_layer_norm) def set_decoder(self, spec, decoder): @@ -370,42 +359,19 @@ def set_decoder(self, spec, decoder): self.set_linear(layer_spec.ffn.linear_1, layer.fc2) self.set_layer_norm(layer_spec.ffn.layer_norm, layer.final_layer_norm) - def set_attention(self, spec, attention, self_attention=False, low_rank=False): - split_layers = [ - common_spec.LowRankLinearSpec() if hasattr(attention.q_proj, 'weight1') else common_spec.LinearSpec(), - common_spec.LowRankLinearSpec() if hasattr(attention.k_proj, 'weight1') else common_spec.LinearSpec(), - common_spec.LowRankLinearSpec() if hasattr(attention.v_proj, 'weight1') else common_spec.LinearSpec(), - ] - - if hasattr(split_layers[0], "weight1"): - self.set_low_rank_linear(split_layers[0], attention.q_proj) - else: - self.set_linear(split_layers[0], attention.q_proj) - - if hasattr(split_layers[1], "weight1"): - self.set_low_rank_linear(split_layers[1], attention.k_proj) - else: - self.set_linear(split_layers[1], attention.k_proj) - - if hasattr(split_layers[2], "weight1"): - self.set_low_rank_linear(split_layers[2], attention.v_proj) - else: - self.set_linear(split_layers[2], attention.v_proj) + def set_attention(self, spec, attention, self_attention=False): + split_layers = [common_spec.LinearSpec() for _ in range(3)] + self.set_linear(split_layers[0], attention.q_proj) + self.set_linear(split_layers[1], attention.k_proj) + self.set_linear(split_layers[2], attention.v_proj) if self_attention: - if low_rank: - utils.fuse_low_rank_linear(spec.linear[0], split_layers) - else: - utils.fuse_linear(spec.linear[0], split_layers) + utils.fuse_linear(spec.linear[0], split_layers) else: utils.fuse_linear(spec.linear[0], split_layers[:1]) utils.fuse_linear(spec.linear[1], split_layers[1:]) - if hasattr(attention.out_proj, "weight1"): - self.set_low_rank_linear(spec.linear[-1], attention.out_proj) - else: - spec.linear[-1] = common_spec.LinearSpec() - self.set_linear(spec.linear[-1], attention.out_proj) + self.set_linear(spec.linear[-1], attention.out_proj) def set_common_layers(self, spec, module): import math @@ -952,16 +918,15 @@ class WhisperLoader(BartLoader): def architecture_name(self): return "WhisperForConditionalGeneration" - def get_model_spec(self, model, low_rank=False): + def get_model_spec(self, model): spec = whisper_spec.WhisperSpec( model.config.encoder_layers, model.config.encoder_attention_heads, model.config.decoder_layers, model.config.decoder_attention_heads, - low_rank=low_rank, ) - self.set_encoder(spec.encoder, model.model.encoder, low_rank=low_rank) + self.set_encoder(spec.encoder, model.model.encoder) self.set_decoder(spec.decoder, model.model.decoder) self.set_linear(spec.decoder.projection, model.proj_out) @@ -1030,10 +995,10 @@ def get_vocabulary(self, model, tokenizer): def set_vocabulary(self, spec, tokens): spec.register_vocabulary(tokens) - def set_encoder(self, spec, encoder, low_rank=False): + def set_encoder(self, spec, encoder): self.set_conv1d(spec.conv1, encoder.conv1) self.set_conv1d(spec.conv2, encoder.conv2) - super().set_encoder(spec, encoder, low_rank=low_rank) + super().set_encoder(spec, encoder) def set_decoder(self, spec, decoder): self.set_embeddings(spec.embeddings, decoder.embed_tokens) @@ -1054,7 +1019,70 @@ def architecture_name(self): return "LiteWhisperForConditionalGeneration" def get_model_spec(self, model): - return super().get_model_spec(model, low_rank=True) + spec = whisper_spec.WhisperSpec( + model.config.encoder_layers, + model.config.encoder_attention_heads, + model.config.decoder_layers, + model.config.decoder_attention_heads, + low_rank=True, + ) + + self.set_encoder(spec.encoder, model.model.encoder) + self.set_decoder(spec.decoder, model.model.decoder) + self.set_linear(spec.decoder.projection, model.proj_out) + + return spec + + def set_encoder(self, spec, encoder): + self.set_conv1d(spec.conv1, encoder.conv1) + self.set_conv1d(spec.conv2, encoder.conv2) + + self.set_common_layers(spec, encoder) + + for layer_spec, layer in zip(spec.layer, encoder.layers): + self.set_low_rank_attention( + layer_spec.self_attention, + layer.self_attn, + self_attention=True, + ) + self.set_layer_norm( + layer_spec.self_attention.layer_norm, + layer.self_attn_layer_norm, + ) + + # Double check if these are low rank or not because of potential + # fall backs to full precision. + if hasattr(layer.fc1, 'weight1'): + self.set_low_rank_linear(layer_spec.ffn.linear_0, layer.fc1) + else: + layer_spec.ffn.linear_0 = common_spec.LinearSpec() + self.set_linear(layer_spec.ffn.linear_0, layer.fc1) + + if hasattr(layer.fc2, 'weight1'): + self.set_low_rank_linear(layer_spec.ffn.linear_1, layer.fc2) + else: + layer_spec.ffn.linear_1 = common_spec.LinearSpec() + self.set_linear(layer_spec.ffn.linear_1, layer.fc2) + + self.set_layer_norm(layer_spec.ffn.layer_norm, layer.final_layer_norm) + + def set_low_rank_attention(self, spec, attention): + if hasattr(attention.q_proj, "weight1"): + self.set_low_rank_linear(spec.linear[0], attention.q_proj) + else: + self.set_linear(spec.linear[0], attention.q_proj) + + if hasattr(attention.k_proj, "weight1"): + self.set_low_rank_linear(spec.linear[1], attention.k_proj) + else: + self.set_linear(spec.linear[1], attention.k_proj) + + if hasattr(attention.v_pro, "weight1"): + self.set_low_rank_linear(spec.linear[2], attention.v_proj) + else: + self.set_linear(spec.linear[2], attention.v_proj) + + self.set_linear(spec.linear[-1], attention.out_proj) @register_loader("Wav2Vec2Config") class Wav2Vec2Loader(BartLoader): diff --git a/python/ctranslate2/converters/utils.py b/python/ctranslate2/converters/utils.py index cf9b23c18..c744ff925 100644 --- a/python/ctranslate2/converters/utils.py +++ b/python/ctranslate2/converters/utils.py @@ -35,41 +35,6 @@ def fuse_linear(spec, layers): ) -def fuse_low_rank_linear(spec, layers): - if not layers: - raise ValueError("Cannot fuse low rank linear layers: at least one layer is required") - - if isinstance(layers[0].weight1, np.ndarray): - concatenate = np.concatenate - zeros = np.zeros - else: - import torch - - concatenate = torch.cat - zeros = torch.zeros - - # TODO(eyoel-gebre): maybe don't concat fallback linear to low_rank? - spec.weight1 = concatenate([layer.weight if not hasattr(layer, 'weight1') else layer.weight1 for layer in layers], axis=1) - spec.weight2 = concatenate([layer.weight if not hasattr(layer, 'weight2') else layer.weight2 for layer in layers]) - - bias_dtype = None - for layer in layers: - if layer.has_bias(): - bias_dtype = layer.bias.dtype - break - - if bias_dtype is not None: - spec.bias = concatenate( - [ - ( - layer.bias - if layer.has_bias() - else zeros([layer.weight1.shape[0]], dtype=bias_dtype) - ) - for layer in layers - ] - ) - def fuse_linear_prequant(spec, layers, axis): if not layers: raise ValueError("Cannot fuse linear layers: at least one layer is required") diff --git a/python/ctranslate2/specs/attention_spec.py b/python/ctranslate2/specs/attention_spec.py index cd44802d4..1e90c2246 100644 --- a/python/ctranslate2/specs/attention_spec.py +++ b/python/ctranslate2/specs/attention_spec.py @@ -37,9 +37,12 @@ def __init__( self.queries_scale = model_spec.OPTIONAL self.layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm) - self.linear = [ - common_spec.LinearSpec() if not low_rank else common_spec.LowRankLinearSpec() for _ in range(2 if self_attention else 3) - ] + if low_rank: + self.linear = [common_spec.LowRankLinearSpec() for _ in range(4)] + else: + self.linear = [ + common_spec.LinearSpec() for _ in range(2 if self_attention else 3) + ] if relative_position: self.relative_position_keys = None diff --git a/python/ctranslate2/specs/common_spec.py b/python/ctranslate2/specs/common_spec.py index 4e048ce2d..56577cfc9 100644 --- a/python/ctranslate2/specs/common_spec.py +++ b/python/ctranslate2/specs/common_spec.py @@ -54,8 +54,8 @@ def has_bias(self): class LowRankLinearSpec(model_spec.LayerSpec): def __init__(self): super().__init__() - self.weight1 = None - self.weight2 = None + self.low_rank_weight1 = None + self.low_rank_weight2 = None self.weight_scale = model_spec.OPTIONAL self.weight_zero = model_spec.OPTIONAL self.bias = model_spec.OPTIONAL diff --git a/src/layers/attention.cc b/src/layers/attention.cc index 6ad344410..239c0046c 100644 --- a/src/layers/attention.cc +++ b/src/layers/attention.cc @@ -334,6 +334,7 @@ namespace ctranslate2 { return _d_model; } + // TODO: Maybe need to also impl in flash attn. void MultiHeadAttention::operator()(const StorageView& queries, const StorageView& values, const StorageView* values_lengths, @@ -360,13 +361,22 @@ namespace ctranslate2 { q = &queries_proj; } - _linear[0](*q, fused_proj); + if (!_is_low_rank) { + _linear[0](*q, fused_proj); + } else { + // Low-rank attention does not fuse qkv. + _linear[0](*q, queries_proj); + _linear[1](*q, keys_proj); + _linear[2](*q, values_proj); + } dim_t beam_size = 1; bool prefilling = (_sliding_window > 0 && values_lengths); if (!_self_attention) { + if (_is_low_rank) + throw std::invalid_argument("MultiHeadAttention does not support low-rank attention with cross-attention"); queries_proj = std::move(fused_proj); if (cached_keys == nullptr || cached_keys->empty()) { @@ -401,6 +411,8 @@ namespace ctranslate2 { } else { if (_num_heads_kv < _num_heads) { + if (_is_low_rank) + throw std::invalid_argument("MutliHeadAttention does not support low-rank attention with multi-query or GQA"); if (queries_padder) queries_padder->add_padding(fused_proj); @@ -419,8 +431,15 @@ namespace ctranslate2 { } } else { - split_heads(fused_proj, 3 * _num_heads, queries_padder); - ops::Split(1)(fused_proj, queries_proj, keys_proj, values_proj); + if (!_is_low_rank) { + split_heads(fused_proj, 3 * _num_heads, queries_padder); + ops::Split(1)(fused_proj, queries_proj, keys_proj, values_proj); + } else { + // No GQA or multi-query attention means each head has the same size. + split_heads(queries_proj, _num_heads, queries_padder); + split_heads(keys_proj, _num_heads, queries_padder); + split_heads(values_proj, _num_heads, queries_padder); + } } if (_rotary_embeddings) { diff --git a/src/layers/attention_layer.cc b/src/layers/attention_layer.cc index c9ae67409..38064656e 100644 --- a/src/layers/attention_layer.cc +++ b/src/layers/attention_layer.cc @@ -51,10 +51,25 @@ namespace ctranslate2 { return alibi; } + static bool set_low_rank(const models::Model& model, const std::string& scope) { + const StorageView* low_rank_weight = model.get_variable_if_exists(scope + "/linear_0/low_rank_weight_1"); + if (low_rank_weight) { + return true; + } + return false; + } + static std::vector make_linear_layers(const models::Model& model, const std::string& scope, - bool self_attention) { - const dim_t num_linear_layers = self_attention ? 2 : 3; + bool self_attention, + bool _is_low_rank) { + dim_t num_linear_layers; + if (!_is_low_rank) { + num_linear_layers = self_attention ? 2 : 3; + } else { + num_linear_layers = 4; + } + std::vector layers; layers.reserve(num_linear_layers); for (dim_t i = 0; i < num_linear_layers; ++i) @@ -117,11 +132,12 @@ namespace ctranslate2 { bool is_decoder, Alibi* alibi, bool is_flash_attn) - : _tensor_parallel(model.tensor_parallel()) + : _is_low_rank(set_low_rank(model, scope)) + , _tensor_parallel(model.tensor_parallel()) , _num_heads(_tensor_parallel ? SAFE_DIVIDE(num_heads, ScopedMPISetter::getNRanks()) : num_heads) , _self_attention(self_attention) , _is_decoder(is_decoder) - , _linear(make_linear_layers(model, scope, self_attention)) + , _linear(make_linear_layers(model, scope, self_attention, _is_low_rank)) , _d_model(_tensor_parallel ? SAFE_DIVIDE(_linear.back().output_size(), ScopedMPISetter::getNRanks()) : _linear.back().output_size()) , _d_head(model.get_attribute_with_default(scope + "/head_dim", _d_model / _num_heads)) , _pre_norm(pre_norm) diff --git a/src/layers/common.cc b/src/layers/common.cc index c6d1cd0b5..10f95666b 100644 --- a/src/layers/common.cc +++ b/src/layers/common.cc @@ -250,6 +250,13 @@ namespace ctranslate2 { return _encoding.dim(1); } + static bool set_low_rank(const models::Model& model, const std::string& scope) { + const StorageView* low_rank_weight = model.get_variable_if_exists(scope + "/low_rank_weight_1"); + if (low_rank_weight) { + return true; + } + return false; + } static const StorageView& get_linear_weight(const models::Model& model, const std::string& scope, @@ -268,7 +275,9 @@ namespace ctranslate2 { const ops::ActivationType* activation_type, const bool is_layer_out) : _packed_weight(false) - , _weight(get_linear_weight(model, scope, &_packed_weight)) + , _is_low_rank(set_low_rank(model, scope)) + , _weight(_is_low_rank ? *model.get_variable_if_exists(scope + "/low_rank_weight1") : get_linear_weight(model, scope, &_packed_weight)) + , _weight2(_is_low_rank ? model.get_variable_if_exists(scope + "/low_rank_weight2") : nullptr) , _bias(model.get_variable_if_exists(scope + "/bias")) , _qscale(model.get_variable_if_exists(scope + "/weight_scale")) , _qzero(model.get_variable_if_exists(scope + "/weight_zero")) @@ -307,6 +316,10 @@ namespace ctranslate2 { } dim_t Dense::output_size() const { + if (_is_low_rank) { + // TODO: Double check this + return _weight2->dim(0); + } return _partial_weight ? _partial_weight.dim(0) : _weight.dim(0); } @@ -340,6 +353,7 @@ namespace ctranslate2 { PROFILE("Dense"); const StorageView* qscale = _partial_qscale.empty() ? _qscale : &_partial_qscale; const StorageView* weight = _partial_weight.empty() ? &_weight : &_partial_weight; + const StorageView* weight2 = _is_low_rank ? _weight2 : nullptr; const StorageView* bias = _partial_bias.empty() ? _bias : &_partial_bias; const StorageView* compensation = (_partial_u8_shift_compensation.empty() ? _u8_shift_compensation @@ -349,6 +363,8 @@ namespace ctranslate2 { if (affected_by_tp && ScopedMPISetter::getCurRank() != 0) bias = nullptr; if (_quantized_gemm) { + if (_is_low_rank) + throw std::runtime_error("Low rank dense layer not supported with quantized gemm"); const auto device = input.device(); StorageView qinput(_weight.dtype(), device); StorageView qinput_scale(_qscale->dtype(), device); @@ -396,6 +412,8 @@ namespace ctranslate2 { output, bias); } else if (_qzero && _qscale) { + if (_is_low_rank) + throw std::runtime_error("Low rank dense layer not supported with quantized gemm"); switch (_quant_method) { case models::QUANTIZATION_TYPE::AWQ_GEMM: if (input.dim(0) * input.dim(1) >= 1024) { @@ -428,7 +446,9 @@ namespace ctranslate2 { "support only ct2 and awq quantization"); } } else { - _gemm_op(input, *weight, output, nullptr, bias); + StorageView& intermediate_output = output; + _gemm_op(input, *weight, intermediate_output, nullptr); + _gemm_op(intermediate_output, *weight2, output, nullptr, bias); } } From a29eef3cfe91934a41ef89dfcaf18d4bf1140150 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Wed, 30 Apr 2025 08:38:43 -0700 Subject: [PATCH 46/64] debugging --- python/ctranslate2/converters/transformers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 69b0ecc66..317b8add7 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -1043,7 +1043,6 @@ def set_encoder(self, spec, encoder): self.set_low_rank_attention( layer_spec.self_attention, layer.self_attn, - self_attention=True, ) self.set_layer_norm( layer_spec.self_attention.layer_norm, From 0b5fa40bbdd8efafcbb0772d6d2d8e03edf6c83b Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Wed, 30 Apr 2025 08:39:29 -0700 Subject: [PATCH 47/64] debugging --- python/ctranslate2/converters/transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 317b8add7..3a47c2be0 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -1076,7 +1076,7 @@ def set_low_rank_attention(self, spec, attention): else: self.set_linear(spec.linear[1], attention.k_proj) - if hasattr(attention.v_pro, "weight1"): + if hasattr(attention.v_proj, "weight1"): self.set_low_rank_linear(spec.linear[2], attention.v_proj) else: self.set_linear(spec.linear[2], attention.v_proj) From 726666318292e357b8e17987718ca6907036f133 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Wed, 30 Apr 2025 08:40:47 -0700 Subject: [PATCH 48/64] debugging --- python/ctranslate2/converters/transformers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 3a47c2be0..d6370e2e3 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -1081,7 +1081,10 @@ def set_low_rank_attention(self, spec, attention): else: self.set_linear(spec.linear[2], attention.v_proj) - self.set_linear(spec.linear[-1], attention.out_proj) + if hasattr(attention.out_proj, "weight1"): + self.set_low_rank_linear(spec.linear[-1], attention.out_proj) + else + self.set_linear(spec.linear[-1], attention.out_proj) @register_loader("Wav2Vec2Config") class Wav2Vec2Loader(BartLoader): From dacb94982d63a348f3651422ac060b615f3062a5 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Wed, 30 Apr 2025 08:42:33 -0700 Subject: [PATCH 49/64] debugging --- python/ctranslate2/converters/transformers.py | 29 +++++++------------ 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index d6370e2e3..7b025138c 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -1065,26 +1065,17 @@ def set_encoder(self, spec, encoder): self.set_layer_norm(layer_spec.ffn.layer_norm, layer.final_layer_norm) - def set_low_rank_attention(self, spec, attention): - if hasattr(attention.q_proj, "weight1"): - self.set_low_rank_linear(spec.linear[0], attention.q_proj) - else: - self.set_linear(spec.linear[0], attention.q_proj) - - if hasattr(attention.k_proj, "weight1"): - self.set_low_rank_linear(spec.linear[1], attention.k_proj) - else: - self.set_linear(spec.linear[1], attention.k_proj) - - if hasattr(attention.v_proj, "weight1"): - self.set_low_rank_linear(spec.linear[2], attention.v_proj) + def set_low_rank_or_linear_router(self, spec, module): + if hasattr(module, "weight1"): + self.set_low_rank_linear(spec, module) else: - self.set_linear(spec.linear[2], attention.v_proj) - - if hasattr(attention.out_proj, "weight1"): - self.set_low_rank_linear(spec.linear[-1], attention.out_proj) - else - self.set_linear(spec.linear[-1], attention.out_proj) + self.set_linear(spec, module) + + def set_low_rank_attention(self, spec, attention): + self.set_low_rank_or_linear_router(spec.linear[0], attention.q_proj) + self.set_low_rank_or_linear_router(spec.linear[1], attention.k_proj) + self.set_low_rank_or_linear_router(spec.linear[2], attention.v_proj) + self.set_low_rank_or_linear_router(spec.linear[3], attention.out_proj) @register_loader("Wav2Vec2Config") class Wav2Vec2Loader(BartLoader): From ee0527c7e20442f47926e94a0099fe5a50199287 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Wed, 30 Apr 2025 08:49:23 -0700 Subject: [PATCH 50/64] debugging --- python/ctranslate2/converters/transformers.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 7b025138c..7e7d1c3b8 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -1065,17 +1065,18 @@ def set_encoder(self, spec, encoder): self.set_layer_norm(layer_spec.ffn.layer_norm, layer.final_layer_norm) - def set_low_rank_or_linear_router(self, spec, module): + def set_low_rank_or_linear_router(self, spec, i, module): if hasattr(module, "weight1"): - self.set_low_rank_linear(spec, module) + self.set_low_rank_linear(spec.linear[i], module) else: - self.set_linear(spec, module) + spec.linear[i] = common_spec.LinearSpec() + self.set_linear(spec.linear[i], module) def set_low_rank_attention(self, spec, attention): - self.set_low_rank_or_linear_router(spec.linear[0], attention.q_proj) - self.set_low_rank_or_linear_router(spec.linear[1], attention.k_proj) - self.set_low_rank_or_linear_router(spec.linear[2], attention.v_proj) - self.set_low_rank_or_linear_router(spec.linear[3], attention.out_proj) + self.set_low_rank_or_linear_router(spec, attention.q_proj, 0) + self.set_low_rank_or_linear_router(spec, attention.k_proj, 1) + self.set_low_rank_or_linear_router(spec, attention.v_proj, 2) + self.set_low_rank_or_linear_router(spec, attention.out_proj, 3) @register_loader("Wav2Vec2Config") class Wav2Vec2Loader(BartLoader): From 5cf90b9dca3c876d6c776d4006f3f8681e5f9474 Mon Sep 17 00:00:00 2001 From: eyoel gebre Date: Wed, 30 Apr 2025 08:50:19 -0700 Subject: [PATCH 51/64] debugging --- python/ctranslate2/converters/transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 7e7d1c3b8..81db12240 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -1065,7 +1065,7 @@ def set_encoder(self, spec, encoder): self.set_layer_norm(layer_spec.ffn.layer_norm, layer.final_layer_norm) - def set_low_rank_or_linear_router(self, spec, i, module): + def set_low_rank_or_linear_router(self, spec, module, i): if hasattr(module, "weight1"): self.set_low_rank_linear(spec.linear[i], module) else: From 3098c959e23701e52d97748b304a46cdc5347b06 Mon Sep 17 00:00:00 2001 From: Eyoel Gebre Date: Thu, 1 May 2025 15:29:35 +0000 Subject: [PATCH 52/64] naming fix --- python/ctranslate2/converters/transformers.py | 8 ++++---- python/ctranslate2/specs/common_spec.py | 4 ++-- src/layers/common.cc | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 81db12240..fb6b3e1b6 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -243,11 +243,11 @@ def set_linear(self, spec, module, quant_type=common_spec.Quantization.CT2): def set_low_rank_linear(self, spec, module, quant_type=common_spec.Quantization.CT2): if quant_type == common_spec.Quantization.CT2: - spec.low_rank_weight1 = module.weight1 - spec.low_rank_weight2 = module.weight2 + spec.low_rank_weight_1 = module.weight1 + spec.low_rank_weight_2 = module.weight2 else: - spec.low_rank_weight1 = module.qweight1 - spec.low_rank_weight2 = module.qweight2 + spec.low_rank_weight_1 = module.qweight1 + spec.low_rank_weight_2 = module.qweight2 spec.weight_scale = module.scales spec.weight_zero = module.qzeros diff --git a/python/ctranslate2/specs/common_spec.py b/python/ctranslate2/specs/common_spec.py index 56577cfc9..4233d5fd7 100644 --- a/python/ctranslate2/specs/common_spec.py +++ b/python/ctranslate2/specs/common_spec.py @@ -54,8 +54,8 @@ def has_bias(self): class LowRankLinearSpec(model_spec.LayerSpec): def __init__(self): super().__init__() - self.low_rank_weight1 = None - self.low_rank_weight2 = None + self.low_rank_weight_1 = None + self.low_rank_weight_2 = None self.weight_scale = model_spec.OPTIONAL self.weight_zero = model_spec.OPTIONAL self.bias = model_spec.OPTIONAL diff --git a/src/layers/common.cc b/src/layers/common.cc index 10f95666b..29883065f 100644 --- a/src/layers/common.cc +++ b/src/layers/common.cc @@ -276,8 +276,8 @@ namespace ctranslate2 { const bool is_layer_out) : _packed_weight(false) , _is_low_rank(set_low_rank(model, scope)) - , _weight(_is_low_rank ? *model.get_variable_if_exists(scope + "/low_rank_weight1") : get_linear_weight(model, scope, &_packed_weight)) - , _weight2(_is_low_rank ? model.get_variable_if_exists(scope + "/low_rank_weight2") : nullptr) + , _weight(_is_low_rank ? *model.get_variable_if_exists(scope + "/low_rank_weight_1") : get_linear_weight(model, scope, &_packed_weight)) + , _weight2(_is_low_rank ? model.get_variable_if_exists(scope + "/low_rank_weight_2") : nullptr) , _bias(model.get_variable_if_exists(scope + "/bias")) , _qscale(model.get_variable_if_exists(scope + "/weight_scale")) , _qzero(model.get_variable_if_exists(scope + "/weight_zero")) From 5caaee5da8a9d82393fd02f9e09b0af2dda93890 Mon Sep 17 00:00:00 2001 From: Eyoel Gebre Date: Fri, 2 May 2025 02:52:57 +0000 Subject: [PATCH 53/64] . --- src/layers/attention.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/layers/attention.cc b/src/layers/attention.cc index 239c0046c..7a28d1e5f 100644 --- a/src/layers/attention.cc +++ b/src/layers/attention.cc @@ -334,7 +334,6 @@ namespace ctranslate2 { return _d_model; } - // TODO: Maybe need to also impl in flash attn. void MultiHeadAttention::operator()(const StorageView& queries, const StorageView& values, const StorageView* values_lengths, From 23df46043dd602431d5feda06653afc613782cf6 Mon Sep 17 00:00:00 2001 From: Eyoel Gebre Date: Fri, 2 May 2025 06:26:37 +0000 Subject: [PATCH 54/64] shapes --- src/layers/attention.cc | 4 ++-- src/layers/common.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/layers/attention.cc b/src/layers/attention.cc index 7a28d1e5f..4855ae386 100644 --- a/src/layers/attention.cc +++ b/src/layers/attention.cc @@ -436,8 +436,8 @@ namespace ctranslate2 { } else { // No GQA or multi-query attention means each head has the same size. split_heads(queries_proj, _num_heads, queries_padder); - split_heads(keys_proj, _num_heads, queries_padder); - split_heads(values_proj, _num_heads, queries_padder); + split_heads(keys_proj, _num_heads_kv, queries_padder); + split_heads(values_proj, _num_heads_kv, queries_padder); } } diff --git a/src/layers/common.cc b/src/layers/common.cc index 29883065f..035da4b6a 100644 --- a/src/layers/common.cc +++ b/src/layers/common.cc @@ -296,7 +296,7 @@ namespace ctranslate2 { , _gemm_op(/*alpha=*/1, /*beta=*/0, /*trans_a=*/false, - /*trans_b=*/true, + /*trans_b=*/ _is_low_rank ? false : true, /*a_is_packed=*/false, _packed_weight, _quantized_gemm ? nullptr : activation_type) From 6bdce8509f083fe6941c5a09ef2f2abd5777bb6e Mon Sep 17 00:00:00 2001 From: Eyoel Gebre Date: Fri, 2 May 2025 06:27:14 +0000 Subject: [PATCH 55/64] shapes2 --- src/layers/common.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/layers/common.cc b/src/layers/common.cc index 035da4b6a..b4be87ec4 100644 --- a/src/layers/common.cc +++ b/src/layers/common.cc @@ -445,6 +445,9 @@ namespace ctranslate2 { throw std::invalid_argument("Dense forward: invalid quantized type," "support only ct2 and awq quantization"); } + } else { + if (!_is_low_rank) { + _gemm_op(input, *weight, output, nullptr, bias); } else { StorageView& intermediate_output = output; _gemm_op(input, *weight, intermediate_output, nullptr); From 2821169e39a31270d83645ea40dc4ef711976b53 Mon Sep 17 00:00:00 2001 From: Eyoel Gebre Date: Fri, 2 May 2025 06:28:35 +0000 Subject: [PATCH 56/64] shape3 --- src/layers/common.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/layers/common.cc b/src/layers/common.cc index b4be87ec4..1f0612095 100644 --- a/src/layers/common.cc +++ b/src/layers/common.cc @@ -448,10 +448,11 @@ namespace ctranslate2 { } else { if (!_is_low_rank) { _gemm_op(input, *weight, output, nullptr, bias); - } else { - StorageView& intermediate_output = output; - _gemm_op(input, *weight, intermediate_output, nullptr); - _gemm_op(intermediate_output, *weight2, output, nullptr, bias); + } else { + StorageView& intermediate_output = output; + _gemm_op(input, *weight, intermediate_output, nullptr); + _gemm_op(intermediate_output, *weight2, output, nullptr, bias); + } } } From b4b35b3250289b622b1997a0e17c0e7f912b2a8d Mon Sep 17 00:00:00 2001 From: Eyoel Gebre Date: Sat, 3 May 2025 17:31:04 +0000 Subject: [PATCH 57/64] preprocessor --- python/ctranslate2/converters/transformers.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index fb6b3e1b6..657f3217f 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -3,6 +3,7 @@ import gc import itertools import os +import re from typing import List, Optional @@ -96,7 +97,13 @@ def __init__( trust_remote_code: Allow converting models using custom code. """ self._model_name_or_path = model_name_or_path - self._model_processor_name = (model_name_or_path if not model_name_or_path.startswith('efficient-speech/lite-whisper') else 'openai/whisper-large-v3') + self._model_processor_name = model_name_or_path + if model_name_or_path.startswith('efficient-speech/lite-whisper'): + # If this is a lite-whisper model, use openai's + # corresponding preprocessor. + regex = r'whisper-[a-z0-9-]+?(?=-(?:fast|acc)|$)' + regex_result = re.search(regex, model_name_or_path) + self._model_processor_name = f"openai/{regex_result.group()}" self._activation_scales = activation_scales self._copy_files = copy_files self._load_as_float16 = load_as_float16 From f2b7b229fe2da71e7a1e79024d4aa6ff543dc803 Mon Sep 17 00:00:00 2001 From: Eyoel Gebre Date: Sat, 3 May 2025 18:54:03 +0000 Subject: [PATCH 58/64] small --- python/ctranslate2/converters/transformers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 657f3217f..e55a00b14 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -127,6 +127,8 @@ def _load(self): % (config_name, ", ".join(sorted(_MODEL_LOADERS.keys()))) ) + tokenizer_class = transformers.AutoTokenizer + kwargs = { "torch_dtype": ( torch.float16 @@ -152,7 +154,6 @@ def _load(self): if self._trust_remote_code: tokenizer_kwargs["trust_remote_code"] = self._trust_remote_code - tokenizer_class = transformers.AutoTokenizer tokenizer = self.load_tokenizer( tokenizer_class, self._model_processor_name, **tokenizer_kwargs ) From af19fb4bcda11a7267771ae7245a3062a844f9ca Mon Sep 17 00:00:00 2001 From: Eyoel Gebre Date: Sat, 3 May 2025 18:54:22 +0000 Subject: [PATCH 59/64] small --- python/ctranslate2/converters/transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index e55a00b14..eb04a3677 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -128,7 +128,7 @@ def _load(self): ) tokenizer_class = transformers.AutoTokenizer - + kwargs = { "torch_dtype": ( torch.float16 From 84e346a95a17538f452c7e90c0450a865d9f57d8 Mon Sep 17 00:00:00 2001 From: Eyoel Gebre Date: Sun, 4 May 2025 01:18:11 +0000 Subject: [PATCH 60/64] dims --- src/layers/common.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/layers/common.cc b/src/layers/common.cc index 1f0612095..01918cbdf 100644 --- a/src/layers/common.cc +++ b/src/layers/common.cc @@ -317,8 +317,10 @@ namespace ctranslate2 { dim_t Dense::output_size() const { if (_is_low_rank) { - // TODO: Double check this - return _weight2->dim(0); + if (_partial_weight) + throw std::runtime_error("Low rank dense layer does not support partial weights"); + // weight is transposed when low_rank + return _weight2->dim(1); } return _partial_weight ? _partial_weight.dim(0) : _weight.dim(0); } From 6ac5325e8f0d1b39261e7454249ae8d2fc398f36 Mon Sep 17 00:00:00 2001 From: Eyoel Gebre Date: Sun, 4 May 2025 01:24:40 +0000 Subject: [PATCH 61/64] tweaks --- python/ctranslate2/converters/transformers.py | 5 ++++- src/layers/attention.cc | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index eb04a3677..9c7095d7b 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -147,8 +147,11 @@ def _load(self): if hasattr(transformers, loader.architecture_name): model_class = getattr(transformers, loader.architecture_name) model = self.load_model(model_class, self._model_name_or_path, **kwargs) - else: + elif self._model_name_or_path.startswith('efficient-speech/lite-whisper'): model = transformers.AutoModel.from_pretrained(self._model_name_or_path, **kwargs) + else: + raise ValueError( + "The model %s is not supported by the converter. " % self._model_name_or_path) tokenizer_kwargs = {} if self._trust_remote_code: diff --git a/src/layers/attention.cc b/src/layers/attention.cc index 4855ae386..526d4742b 100644 --- a/src/layers/attention.cc +++ b/src/layers/attention.cc @@ -434,7 +434,6 @@ namespace ctranslate2 { split_heads(fused_proj, 3 * _num_heads, queries_padder); ops::Split(1)(fused_proj, queries_proj, keys_proj, values_proj); } else { - // No GQA or multi-query attention means each head has the same size. split_heads(queries_proj, _num_heads, queries_padder); split_heads(keys_proj, _num_heads_kv, queries_padder); split_heads(values_proj, _num_heads_kv, queries_padder); From e6a83fc5b48e4b9e61835072528562577908a5f4 Mon Sep 17 00:00:00 2001 From: Eyoel Gebre Date: Sun, 4 May 2025 02:01:16 +0000 Subject: [PATCH 62/64] error handling --- src/layers/common.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/layers/common.cc b/src/layers/common.cc index 01918cbdf..9b8d7c3d9 100644 --- a/src/layers/common.cc +++ b/src/layers/common.cc @@ -353,6 +353,8 @@ namespace ctranslate2 { void Dense::operator()(const StorageView& input, StorageView& output) const { PROFILE("Dense"); + if (_is_low_rank && !_partial_weight.empty()) + throw std::runtime_error("Low rank dense layer does not support partial weights"); const StorageView* qscale = _partial_qscale.empty() ? _qscale : &_partial_qscale; const StorageView* weight = _partial_weight.empty() ? &_weight : &_partial_weight; const StorageView* weight2 = _is_low_rank ? _weight2 : nullptr; From 76c0bb43df40c99a0094db9e472a48567ba12dce Mon Sep 17 00:00:00 2001 From: Eyoel Gebre Date: Sun, 4 May 2025 02:05:09 +0000 Subject: [PATCH 63/64] minor --- .gitignore | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 4348bc598..bd54a91c1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ *.pyc /.vs +.vscode /build CMake*.json @@ -10,5 +11,3 @@ python/dist/ .cache docs/build/ docs/python/ - -.vscode From 132258ece4a64abfd0b646aa6b501d9329d6dcab Mon Sep 17 00:00:00 2001 From: Keisuke Kamahori Date: Tue, 19 Aug 2025 03:06:49 +0000 Subject: [PATCH 64/64] fix: issues with lite-whisper models --- include/ctranslate2/layers/common.h | 1 + python/ctranslate2/converters/transformers.py | 111 ++++++++++++------ python/ctranslate2/specs/attention_spec.py | 9 +- python/ctranslate2/specs/common_spec.py | 24 ++-- python/ctranslate2/specs/transformer_spec.py | 7 +- python/ctranslate2/specs/whisper_spec.py | 3 +- src/layers/attention.cc | 18 +-- src/layers/attention_layer.cc | 20 ++-- src/layers/common.cc | 38 +++--- 9 files changed, 141 insertions(+), 90 deletions(-) diff --git a/include/ctranslate2/layers/common.h b/include/ctranslate2/layers/common.h index b6d8a29bd..fcd7e8114 100644 --- a/include/ctranslate2/layers/common.h +++ b/include/ctranslate2/layers/common.h @@ -150,6 +150,7 @@ namespace ctranslate2 { const models::QUANTIZATION_TYPE _quant_method; const bool _quantized_gemm; const ops::Gemm _gemm_op; + const ops::Gemm _gemm_op_low_rank; const ops::Quantize _quantize_op; const ops::Dequantize _dequantize_op; const ops::ActivationType* _activation_type; diff --git a/python/ctranslate2/converters/transformers.py b/python/ctranslate2/converters/transformers.py index 9c7095d7b..0c2e121fc 100644 --- a/python/ctranslate2/converters/transformers.py +++ b/python/ctranslate2/converters/transformers.py @@ -3,7 +3,6 @@ import gc import itertools import os -import re from typing import List, Optional @@ -97,13 +96,6 @@ def __init__( trust_remote_code: Allow converting models using custom code. """ self._model_name_or_path = model_name_or_path - self._model_processor_name = model_name_or_path - if model_name_or_path.startswith('efficient-speech/lite-whisper'): - # If this is a lite-whisper model, use openai's - # corresponding preprocessor. - regex = r'whisper-[a-z0-9-]+?(?=-(?:fast|acc)|$)' - regex_result = re.search(regex, model_name_or_path) - self._model_processor_name = f"openai/{regex_result.group()}" self._activation_scales = activation_scales self._copy_files = copy_files self._load_as_float16 = load_as_float16 @@ -127,6 +119,14 @@ def _load(self): % (config_name, ", ".join(sorted(_MODEL_LOADERS.keys()))) ) + # If lite whisper use corresponding openai tokenizer + if config.model_type == "lite-whisper": + base_name = self._model_name_or_path.split("/")[-1] # e.g., "lite-whisper-large-v3" + base_name = base_name.replace("lite-", "") # e.g., "whisper-large-v3" + tokenizer_path = f"openai/{base_name}" + else: + tokenizer_path = self._model_name_or_path + tokenizer_class = transformers.AutoTokenizer kwargs = { @@ -147,18 +147,15 @@ def _load(self): if hasattr(transformers, loader.architecture_name): model_class = getattr(transformers, loader.architecture_name) model = self.load_model(model_class, self._model_name_or_path, **kwargs) - elif self._model_name_or_path.startswith('efficient-speech/lite-whisper'): - model = transformers.AutoModel.from_pretrained(self._model_name_or_path, **kwargs) else: - raise ValueError( - "The model %s is not supported by the converter. " % self._model_name_or_path) + model = transformers.AutoModel.from_pretrained(self._model_name_or_path, **kwargs) tokenizer_kwargs = {} if self._trust_remote_code: tokenizer_kwargs["trust_remote_code"] = self._trust_remote_code tokenizer = self.load_tokenizer( - tokenizer_class, self._model_processor_name, **tokenizer_kwargs + tokenizer_class, tokenizer_path, **tokenizer_kwargs ) spec = loader(model, tokenizer) @@ -251,19 +248,6 @@ def set_linear(self, spec, module, quant_type=common_spec.Quantization.CT2): spec.weight = spec.weight.transpose(0, 1) if module.bias is not None: spec.bias = module.bias - - def set_low_rank_linear(self, spec, module, quant_type=common_spec.Quantization.CT2): - if quant_type == common_spec.Quantization.CT2: - spec.low_rank_weight_1 = module.weight1 - spec.low_rank_weight_2 = module.weight2 - else: - spec.low_rank_weight_1 = module.qweight1 - spec.low_rank_weight_2 = module.qweight2 - spec.weight_scale = module.scales - spec.weight_zero = module.qzeros - - if module.bias is not None: - spec.bias = module.bias def set_embeddings(self, spec, module): spec.weight = module.weight @@ -1044,10 +1028,45 @@ def get_model_spec(self, model): return spec + + def set_config(self, config, model, tokenizer): + gen_config = getattr(model, "generation_config", None) + + if gen_config is not None: + config.suppress_ids = gen_config.suppress_tokens + config.suppress_ids_begin = gen_config.begin_suppress_tokens + if hasattr(gen_config, "alignment_heads"): + config.alignment_heads = gen_config.alignment_heads + if hasattr(gen_config, "lang_to_id"): + config.lang_ids = sorted(gen_config.lang_to_id.values()) + else: + config.suppress_ids = model.config.suppress_tokens + config.suppress_ids_begin = model.config.begin_suppress_tokens + config.alignment_heads = _WHISPER_ALIGNMENT_HEADS.get(model.name_or_path) + + if getattr(config, "lang_ids", None) is None: + config.lang_ids = self._get_lang_ids_from_tokenizer(tokenizer) + + if config.alignment_heads is None: + config.alignment_heads = _WHISPER_ALIGNMENT_HEADS.get(model.name_or_path) + if config.alignment_heads is None: + # Use the last half layers for alignment by default. + num_layers = model.config.decoder_layers + num_heads = model.config.decoder_attention_heads + config.alignment_heads = list( + itertools.product( + range(num_layers // 2, num_layers), + range(num_heads), + ) + ) + def set_encoder(self, spec, encoder): + """ + Override encoder mapping for LiteWhisper. + """ self.set_conv1d(spec.conv1, encoder.conv1) self.set_conv1d(spec.conv2, encoder.conv2) - + self.set_common_layers(spec, encoder) for layer_spec, layer in zip(spec.layer, encoder.layers): @@ -1060,15 +1079,15 @@ def set_encoder(self, spec, encoder): layer.self_attn_layer_norm, ) - # Double check if these are low rank or not because of potential - # fall backs to full precision. - if hasattr(layer.fc1, 'weight1'): + if hasattr(layer.fc1, "weight1"): + # low rank self.set_low_rank_linear(layer_spec.ffn.linear_0, layer.fc1) else: layer_spec.ffn.linear_0 = common_spec.LinearSpec() self.set_linear(layer_spec.ffn.linear_0, layer.fc1) - - if hasattr(layer.fc2, 'weight1'): + + if hasattr(layer.fc2, "weight1"): + # low rank self.set_low_rank_linear(layer_spec.ffn.linear_1, layer.fc2) else: layer_spec.ffn.linear_1 = common_spec.LinearSpec() @@ -1076,13 +1095,26 @@ def set_encoder(self, spec, encoder): self.set_layer_norm(layer_spec.ffn.layer_norm, layer.final_layer_norm) + def set_low_rank_linear(self, spec, module, quant_type=common_spec.Quantization.CT2): + if quant_type == common_spec.Quantization.CT2: + spec.low_rank_weight_1 = module.weight1.transpose(0, 1).contiguous() + spec.low_rank_weight_2 = module.weight2.transpose(0, 1).contiguous() + else: + spec.low_rank_weight_1 = module.qweight1.transpose(0, 1).contiguous() + spec.low_rank_weight_2 = module.qweight2.transpose(0, 1).contiguous() + spec.weight_scale = module.scales + spec.weight_zero = module.qzeros + + if module.bias is not None: + spec.bias = module.bias + def set_low_rank_or_linear_router(self, spec, module, i): if hasattr(module, "weight1"): self.set_low_rank_linear(spec.linear[i], module) else: spec.linear[i] = common_spec.LinearSpec() self.set_linear(spec.linear[i], module) - + def set_low_rank_attention(self, spec, attention): self.set_low_rank_or_linear_router(spec, attention.q_proj, 0) self.set_low_rank_or_linear_router(spec, attention.k_proj, 1) @@ -3000,6 +3032,7 @@ def main(): (3, 4), ], "openai/whisper-tiny": [(2, 2), (3, 0), (3, 2), (3, 3), (3, 4), (3, 5)], + "efficient-speech/whisper-tiny": [(2, 2), (3, 0), (3, 2), (3, 3), (3, 4), (3, 5)], "openai/whisper-base.en": [(3, 3), (4, 7), (5, 1), (5, 5), (5, 7)], "openai/whisper-base": [ (3, 1), @@ -3113,4 +3146,16 @@ def main(): (24, 1), (25, 6), ], + "efficient-speech/whisper-large-v3": [ + (7, 0), + (10, 17), + (12, 18), + (13, 12), + (16, 1), + (17, 14), + (19, 11), + (21, 4), + (24, 1), + (25, 6), + ], } diff --git a/python/ctranslate2/specs/attention_spec.py b/python/ctranslate2/specs/attention_spec.py index 1e90c2246..2d61ad8b1 100644 --- a/python/ctranslate2/specs/attention_spec.py +++ b/python/ctranslate2/specs/attention_spec.py @@ -37,12 +37,9 @@ def __init__( self.queries_scale = model_spec.OPTIONAL self.layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm) - if low_rank: - self.linear = [common_spec.LowRankLinearSpec() for _ in range(4)] - else: - self.linear = [ - common_spec.LinearSpec() for _ in range(2 if self_attention else 3) - ] + linear_cls = common_spec.LinearLowRankSpec if low_rank else common_spec.LinearSpec + count = 4 if low_rank else (2 if self_attention else 3) + self.linear = [linear_cls() for _ in range(count)] if relative_position: self.relative_position_keys = None diff --git a/python/ctranslate2/specs/common_spec.py b/python/ctranslate2/specs/common_spec.py index 4233d5fd7..4209e41da 100644 --- a/python/ctranslate2/specs/common_spec.py +++ b/python/ctranslate2/specs/common_spec.py @@ -51,18 +51,6 @@ def __init__(self): def has_bias(self): return not isinstance(self.bias, str) -class LowRankLinearSpec(model_spec.LayerSpec): - def __init__(self): - super().__init__() - self.low_rank_weight_1 = None - self.low_rank_weight_2 = None - self.weight_scale = model_spec.OPTIONAL - self.weight_zero = model_spec.OPTIONAL - self.bias = model_spec.OPTIONAL - - def has_bias(self): - return not isinstance(self.bias, str) - class Conv1DSpec(model_spec.LayerSpec): def __init__(self): @@ -76,3 +64,15 @@ def __init__(self): self.weight = None self.weight_scale = model_spec.OPTIONAL self.multiply_by_sqrt_depth = model_spec.OPTIONAL + + +class LinearLowRankSpec(model_spec.LayerSpec): + def __init__(self): + self.low_rank_weight_1 = None + self.low_rank_weight_2 = None + self.weight_scale = model_spec.OPTIONAL + self.weight_zero = model_spec.OPTIONAL + self.bias = model_spec.OPTIONAL + + def has_bias(self): + return not isinstance(self.bias, str) \ No newline at end of file diff --git a/python/ctranslate2/specs/transformer_spec.py b/python/ctranslate2/specs/transformer_spec.py index 4be7e9466..f3f789242 100644 --- a/python/ctranslate2/specs/transformer_spec.py +++ b/python/ctranslate2/specs/transformer_spec.py @@ -253,7 +253,7 @@ def __init__( rms_norm=False, num_heads_kv=None, sliding_window=None, - low_rank=False + low_rank=False, ): self.self_attention = attention_spec.MultiHeadAttentionSpec( self_attention=True, @@ -344,8 +344,9 @@ def __init__( class FeedForwardSpec(model_spec.LayerSpec): def __init__(self, glu=False, rms_norm=False, low_rank=False): self.layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm) - self.linear_0 = common_spec.LinearSpec() if not low_rank else common_spec.LowRankLinearSpec() - self.linear_1 = common_spec.LinearSpec() if not low_rank else common_spec.LowRankLinearSpec() + linear_cls = common_spec.LinearLowRankSpec if low_rank else common_spec.LinearSpec + self.linear_0 = linear_cls() + self.linear_1 = linear_cls() if glu: self.linear_0_noact = common_spec.LinearSpec() diff --git a/python/ctranslate2/specs/whisper_spec.py b/python/ctranslate2/specs/whisper_spec.py index a04909d3e..d074c1d7b 100644 --- a/python/ctranslate2/specs/whisper_spec.py +++ b/python/ctranslate2/specs/whisper_spec.py @@ -41,9 +41,10 @@ def __init__( num_encoder_heads: The number of encoder attention heads. num_decoder_layers: The number of decoder layers. num_decoder_heads: The number of decoder attention heads. + low_rank: Whether to use lite whisper model or not. """ super().__init__() - self.encoder = WhisperEncoderSpec(num_encoder_layers, num_encoder_heads, low_rank) + self.encoder = WhisperEncoderSpec(num_encoder_layers, num_encoder_heads, low_rank=low_rank) self.decoder = transformer_spec.TransformerDecoderSpec( num_decoder_layers, num_decoder_heads, diff --git a/src/layers/attention.cc b/src/layers/attention.cc index 526d4742b..005440c2f 100644 --- a/src/layers/attention.cc +++ b/src/layers/attention.cc @@ -360,13 +360,12 @@ namespace ctranslate2 { q = &queries_proj; } - if (!_is_low_rank) { - _linear[0](*q, fused_proj); - } else { - // Low-rank attention does not fuse qkv. - _linear[0](*q, queries_proj); + _linear[0](*q, fused_proj); + + if (_is_low_rank) { // support low-rank _linear[1](*q, keys_proj); _linear[2](*q, values_proj); + queries_proj = std::move(fused_proj); } dim_t beam_size = 1; @@ -375,7 +374,7 @@ namespace ctranslate2 { if (!_self_attention) { if (_is_low_rank) - throw std::invalid_argument("MultiHeadAttention does not support low-rank attention with cross-attention"); + throw std::invalid_argument("lite whisper doesn't use low-rank for cross-attention"); queries_proj = std::move(fused_proj); if (cached_keys == nullptr || cached_keys->empty()) { @@ -411,7 +410,7 @@ namespace ctranslate2 { if (_num_heads_kv < _num_heads) { if (_is_low_rank) - throw std::invalid_argument("MutliHeadAttention does not support low-rank attention with multi-query or GQA"); + throw std::invalid_argument("lite whisper doesn't use low-rank for multi-query or GQA"); if (queries_padder) queries_padder->add_padding(fused_proj); @@ -430,10 +429,11 @@ namespace ctranslate2 { } } else { - if (!_is_low_rank) { + if (!_is_low_rank){ split_heads(fused_proj, 3 * _num_heads, queries_padder); ops::Split(1)(fused_proj, queries_proj, keys_proj, values_proj); - } else { + } + else{ split_heads(queries_proj, _num_heads, queries_padder); split_heads(keys_proj, _num_heads_kv, queries_padder); split_heads(values_proj, _num_heads_kv, queries_padder); diff --git a/src/layers/attention_layer.cc b/src/layers/attention_layer.cc index 38064656e..9e91b8e1e 100644 --- a/src/layers/attention_layer.cc +++ b/src/layers/attention_layer.cc @@ -52,10 +52,16 @@ namespace ctranslate2 { } static bool set_low_rank(const models::Model& model, const std::string& scope) { - const StorageView* low_rank_weight = model.get_variable_if_exists(scope + "/linear_0/low_rank_weight_1"); - if (low_rank_weight) { - return true; + const dim_t max_layers = 4; + for (int i = 0; i < max_layers; ++i) { + std::string prefix = scope + "/linear_" + std::to_string(i); + const StorageView* w1 = model.get_variable_if_exists(prefix + "/low_rank_weight_1"); + const StorageView* w2 = model.get_variable_if_exists(prefix + "/low_rank_weight_2"); + if (w1 && w2) { + return true; + } } + // If no low-rank pair is found, then it is not low-rank return false; } @@ -63,13 +69,7 @@ namespace ctranslate2 { const std::string& scope, bool self_attention, bool _is_low_rank) { - dim_t num_linear_layers; - if (!_is_low_rank) { - num_linear_layers = self_attention ? 2 : 3; - } else { - num_linear_layers = 4; - } - + const dim_t num_linear_layers = !_is_low_rank ? (self_attention ? 2 : 3) : 4; std::vector layers; layers.reserve(num_linear_layers); for (dim_t i = 0; i < num_linear_layers; ++i) diff --git a/src/layers/common.cc b/src/layers/common.cc index 9b8d7c3d9..465a164b3 100644 --- a/src/layers/common.cc +++ b/src/layers/common.cc @@ -250,12 +250,8 @@ namespace ctranslate2 { return _encoding.dim(1); } - static bool set_low_rank(const models::Model& model, const std::string& scope) { - const StorageView* low_rank_weight = model.get_variable_if_exists(scope + "/low_rank_weight_1"); - if (low_rank_weight) { - return true; - } - return false; + static bool has_low_rank(const models::Model& model, const std::string& scope) { + return model.get_variable_if_exists(scope + "/low_rank_weight_1") != nullptr; } static const StorageView& get_linear_weight(const models::Model& model, @@ -275,7 +271,7 @@ namespace ctranslate2 { const ops::ActivationType* activation_type, const bool is_layer_out) : _packed_weight(false) - , _is_low_rank(set_low_rank(model, scope)) + , _is_low_rank(has_low_rank(model, scope)) , _weight(_is_low_rank ? *model.get_variable_if_exists(scope + "/low_rank_weight_1") : get_linear_weight(model, scope, &_packed_weight)) , _weight2(_is_low_rank ? model.get_variable_if_exists(scope + "/low_rank_weight_2") : nullptr) , _bias(model.get_variable_if_exists(scope + "/bias")) @@ -296,10 +292,17 @@ namespace ctranslate2 { , _gemm_op(/*alpha=*/1, /*beta=*/0, /*trans_a=*/false, - /*trans_b=*/ _is_low_rank ? false : true, + /*trans_b=*/true, /*a_is_packed=*/false, _packed_weight, _quantized_gemm ? nullptr : activation_type) + , _gemm_op_low_rank(/*alpha=*/1, + /*beta=*/0, + /*trans_a=*/false, + /*trans_b=*/true, + /*a_is_packed=*/false, + /*packaged_weight=*/false, + /*activation_type=*/ nullptr) , _quantize_op(model.use_global_int16_scale() ? ops::Quantize::ScaleType::GLOBAL : ops::Quantize::ScaleType::PER_LAYER, @@ -319,8 +322,7 @@ namespace ctranslate2 { if (_is_low_rank) { if (_partial_weight) throw std::runtime_error("Low rank dense layer does not support partial weights"); - // weight is transposed when low_rank - return _weight2->dim(1); + return _weight2->dim(0); } return _partial_weight ? _partial_weight.dim(0) : _weight.dim(0); } @@ -368,7 +370,7 @@ namespace ctranslate2 { bias = nullptr; if (_quantized_gemm) { if (_is_low_rank) - throw std::runtime_error("Low rank dense layer not supported with quantized gemm"); + throw std::runtime_error("Low rank dense layer is not supported with quantized gemm"); const auto device = input.device(); StorageView qinput(_weight.dtype(), device); StorageView qinput_scale(_qscale->dtype(), device); @@ -417,7 +419,7 @@ namespace ctranslate2 { bias); } else if (_qzero && _qscale) { if (_is_low_rank) - throw std::runtime_error("Low rank dense layer not supported with quantized gemm"); + throw std::runtime_error("Low rank dense layer is not supported with quantized gemm"); switch (_quant_method) { case models::QUANTIZATION_TYPE::AWQ_GEMM: if (input.dim(0) * input.dim(1) >= 1024) { @@ -450,12 +452,16 @@ namespace ctranslate2 { "support only ct2 and awq quantization"); } } else { - if (!_is_low_rank) { + if(!_is_low_rank) { _gemm_op(input, *weight, output, nullptr, bias); } else { - StorageView& intermediate_output = output; - _gemm_op(input, *weight, intermediate_output, nullptr); - _gemm_op(intermediate_output, *weight2, output, nullptr, bias); + StorageView intermediate_output(input.device(), input.dtype()); + + // First multiplication: input [M,K] * weight^T [K,R] + _gemm_op_low_rank(input, *weight, intermediate_output, nullptr); + + // Second multiplication: intermediate [M,R] * weight2^T [R,N] + _gemm_op(intermediate_output, *weight2, output, nullptr, bias); } } }