Commit 22c6415

refactor: Use MergedColumnParallelLinear for Whisper cross-attention kv_proj
Address maintainer feedback:

- Replace QKVParallelLinear with MergedColumnParallelLinear for kv_proj in WhisperCrossAttention, enabling LoRA support via the existing MergedColumnParallelLinearWithLoRA infrastructure
- Update weight loading to use integer shard indices (0, 1) instead of string identifiers ("k", "v") for MergedColumnParallelLinear
- Remove the redundant embedding_modules and embedding_padding_modules attributes from WhisperForConditionalGeneration
- Remove the example file (similar to the existing multilora_inference.py)
- Roll back the LoRA layer changes, as they are no longer needed
- Update tests to reflect the new architecture

Signed-off-by: daje0601 <englishmt4118@gmail.com>
1 parent ba3826b commit 22c6415
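In short, the cross-attention kv_proj becomes one merged column-parallel projection whose output is split back into K and V. Below is a minimal sketch of the equivalent computation at tp_size=1 in plain PyTorch; `embed_dim`, the tensor shapes, and the `nn.Linear` stand-in are illustrative assumptions, not vLLM's layer, which adds tensor-parallel sharding, quantization, and the LoRA hooks on top.

```python
import torch
import torch.nn as nn

embed_dim = 512

# output_sizes=[embed_dim, embed_dim] corresponds to one weight of shape
# (2 * embed_dim, embed_dim): K rows first, then V rows.
kv_proj = nn.Linear(embed_dim, 2 * embed_dim, bias=False)

encoder_hidden = torch.randn(4, 100, embed_dim)  # (batch, seq, hidden)
kv = kv_proj(encoder_hidden)                     # (batch, seq, 2 * hidden)
k, v = kv.split([embed_dim, embed_dim], dim=-1)  # slice 0 = K, slice 1 = V
assert k.shape == v.shape == encoder_hidden.shape
```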

File tree

examples/offline_inference/whisper_multilora_inference.py
tests/lora/test_whisper_lora.py
vllm/lora/layers/column_parallel_linear.py
vllm/model_executor/models/whisper.py

4 files changed: +55 −222 lines changed


examples/offline_inference/whisper_multilora_inference.py

Lines changed: 0 additions & 136 deletions
This file was deleted.

tests/lora/test_whisper_lora.py

Lines changed: 17 additions & 41 deletions
```diff
@@ -5,17 +5,17 @@
 
 This module tests:
 1. WhisperForConditionalGeneration LoRA interface compliance
-2. MergedQKVParallelLinearWithLoRA support for KV-only (2-slice) configuration
+2. MergedColumnParallelLinearWithLoRA support for KV (2-slice) configuration
 3. WorkerLoRAManager compatibility with Whisper's max_target_positions
 """
 
 import pytest
 import torch
 
 from vllm.lora.layers import (
-    MergedQKVParallelLinearWithLoRA,
+    MergedColumnParallelLinearWithLoRA,
 )
-from vllm.model_executor.layers.linear import QKVParallelLinear
+from vllm.model_executor.layers.linear import MergedColumnParallelLinear
 from vllm.model_executor.models.whisper import WhisperForConditionalGeneration
 from vllm.platforms import current_platform
 
@@ -36,18 +36,6 @@ def test_supports_lora_attribute(self):
             "WhisperForConditionalGeneration should inherit from SupportsLoRA"
         )
 
-    def test_embedding_modules_defined(self):
-        """Verify embedding_modules attribute is defined."""
-        assert hasattr(WhisperForConditionalGeneration, "embedding_modules")
-        assert isinstance(WhisperForConditionalGeneration.embedding_modules, dict)
-
-    def test_embedding_padding_modules_defined(self):
-        """Verify embedding_padding_modules attribute is defined."""
-        assert hasattr(WhisperForConditionalGeneration, "embedding_padding_modules")
-        assert isinstance(
-            WhisperForConditionalGeneration.embedding_padding_modules, list
-        )
-
     def test_packed_modules_mapping_format(self):
         """Verify packed_modules_mapping has correct format for LoRA."""
         mapping = WhisperForConditionalGeneration.packed_modules_mapping
@@ -63,20 +51,18 @@ def test_packed_modules_mapping_format(self):
         assert mapping["kv_proj"] == ["k_proj", "v_proj"]
 
 
-class TestMergedQKVParallelLinearWithLoRAKVOnly:
-    """Test MergedQKVParallelLinearWithLoRA with KV-only (2-slice) configuration."""
+class TestMergedColumnParallelLinearWithLoRAKVOnly:
+    """Test MergedColumnParallelLinearWithLoRA with KV (2-slice) configuration."""
 
     def test_can_replace_layer_accepts_2_modules(self):
-        """Verify can_replace_layer accepts 2-module (KV-only) configurations."""
+        """Verify can_replace_layer accepts 2-module (KV) configurations."""
         from vllm.config.lora import LoRAConfig
 
-        # Create a mock QKVParallelLinear layer
-        # This simulates a KV-only projection (like Whisper's encoder_attn.kv_proj)
-        linear = QKVParallelLinear(
-            hidden_size=512,
-            head_size=64,
-            total_num_heads=8,
-            total_num_kv_heads=8,
+        # Create a MergedColumnParallelLinear layer
+        # This simulates a KV projection (like Whisper's encoder_attn.kv_proj)
+        linear = MergedColumnParallelLinear(
+            input_size=512,
+            output_sizes=[512, 512],  # K and V projections
             bias=False,
            params_dtype=torch.float16,
         )
@@ -88,29 +74,19 @@ def test_can_replace_layer_accepts_2_modules(self):
            lora_extra_vocab_size=0,
        )
 
-        # Test with 2 modules (KV-only, like encoder_attn.kv_proj)
+        # Test with 2 modules (KV, like encoder_attn.kv_proj)
         packed_modules_2 = ["k_proj", "v_proj"]
-        result_2 = MergedQKVParallelLinearWithLoRA.can_replace_layer(
+        result_2 = MergedColumnParallelLinearWithLoRA.can_replace_layer(
             source_layer=linear,
             lora_config=lora_config,
             packed_modules_list=packed_modules_2,
             model_config=None,
         )
-        assert result_2 is True, "Should accept 2-module (KV-only) configuration"
-
-        # Test with 3 modules (QKV, like self_attn.qkv_proj)
-        packed_modules_3 = ["q_proj", "k_proj", "v_proj"]
-        result_3 = MergedQKVParallelLinearWithLoRA.can_replace_layer(
-            source_layer=linear,
-            lora_config=lora_config,
-            packed_modules_list=packed_modules_3,
-            model_config=None,
-        )
-        assert result_3 is True, "Should accept 3-module (QKV) configuration"
+        assert result_2 is True, "Should accept 2-module (KV) configuration"
 
-        # Test with 1 module (should be rejected)
-        packed_modules_1 = ["q_proj"]
-        result_1 = MergedQKVParallelLinearWithLoRA.can_replace_layer(
+        # Test with 1 module (should be rejected for MergedColumnParallelLinear)
+        packed_modules_1 = ["k_proj"]
+        result_1 = MergedColumnParallelLinearWithLoRA.can_replace_layer(
             source_layer=linear,
             lora_config=lora_config,
             packed_modules_list=packed_modules_1,
```
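The `packed_modules_mapping` entry exercised above, `"kv_proj": ["k_proj", "v_proj"]`, is what lets an adapter trained against the separate k_proj/v_proj modules target the merged layer: sub-module i of the mapping feeds slice i of the merged LoRA. Here is a hedged sketch of that grouping step; the adapter dict and its key format are invented for illustration, and vLLM's own packing machinery does the real bookkeeping.

```python
import torch

packed_modules_mapping = {"kv_proj": ["k_proj", "v_proj"]}

# Pretend adapter tensors keyed by the original (unmerged) module names.
rank, hidden = 8, 512
adapter = {
    "encoder_attn.k_proj": torch.randn(rank, hidden),  # lora_a for K
    "encoder_attn.v_proj": torch.randn(rank, hidden),  # lora_a for V
}

# Slice i of the merged kv_proj LoRA comes from sub-module i of the mapping.
lora_a_slices = [
    adapter[f"encoder_attn.{sub}"] for sub in packed_modules_mapping["kv_proj"]
]
assert len(lora_a_slices) == 2  # matches the layer's two output slices
```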

vllm/lora/layers/column_parallel_linear.py

Lines changed: 28 additions & 34 deletions
```diff
@@ -356,6 +356,8 @@ class MergedQKVParallelLinearWithLoRA(MergedColumnParallelLinearWithLoRA):
 
     def __init__(self, base_layer: QKVParallelLinear) -> None:
         super().__init__(base_layer)
+        # There are three LoRA layer.
+        self.n_slices = len(self.base_layer.output_sizes)
 
         self.q_proj_shard_size = self.base_layer.num_heads * self.base_layer.head_size
         self.kv_proj_shard_size = (
@@ -364,23 +366,16 @@ def __init__(self, base_layer: QKVParallelLinear) -> None:
         self.q_shard_id = self.tp_rank
         self.kv_shard_id = self.tp_rank // self.base_layer.num_kv_head_replicas
 
-        # Build output_slices and output_ids dynamically to support both
-        # QKV (3 slices) and KV-only (2 slices) configurations.
-        # KV-only is used in cross-attention layers (e.g., Whisper encoder_attn).
-        slices = []
-        ids = []
-        if self.q_proj_shard_size > 0:
-            slices.append(self.q_proj_shard_size)
-            ids.append(self.q_shard_id)
-        if self.kv_proj_shard_size > 0:
-            slices.append(self.kv_proj_shard_size)
-            ids.append(self.kv_shard_id)
-            slices.append(self.kv_proj_shard_size)
-            ids.append(self.kv_shard_id)
-
-        self.output_slices = tuple(slices)
-        self.output_ids = tuple(ids)
-        self.n_slices = len(self.output_slices)
+        self.output_slices = (
+            self.q_proj_shard_size,
+            self.kv_proj_shard_size,
+            self.kv_proj_shard_size,
+        )
+        self.output_ids = (
+            self.q_shard_id,
+            self.kv_shard_id,
+            self.kv_shard_id,
+        )
 
     def create_lora_weights(
         self,
@@ -403,11 +398,7 @@ def can_replace_layer(
         packed_modules_list: list,
         model_config: PretrainedConfig | None = None,
     ) -> bool:
-        # Support both QKV (3 modules) and KV-only (2 modules) configurations
-        return type(source_layer) is QKVParallelLinear and len(packed_modules_list) in (
-            2,
-            3,
-        )
+        return type(source_layer) is QKVParallelLinear and len(packed_modules_list) == 3
 
 
 # These following layers are based on the tensor parallelism strategy given in
@@ -548,18 +539,21 @@ class MergedQKVParallelLinearWithShardedLoRA(MergedQKVParallelLinearWithLoRA):
     def slice_lora_a(
         self, lora_a: list[torch.Tensor | None]
     ) -> list[torch.Tensor | None]:
-        # NOTE: lora_a contains n_slices subloras, and each sublora could be None.
-        # n_slices is 3 for QKV and 2 for KV-only configurations.
-        shard_size = [self.lora_a_stacked[i].shape[2] for i in range(self.n_slices)]
-        start_idx = [self.tp_rank * shard_size[i] for i in range(self.n_slices)]
-        result: list[torch.Tensor | None] = []
-        for i in range(self.n_slices):
-            lora_a_i = lora_a[i]
-            if lora_a_i is not None:
-                result.append(lora_a_i[start_idx[i] : start_idx[i] + shard_size[i], :])
-            else:
-                result.append(None)
-        return result
+        # NOTE: lora_a contains 3 subloras, and each sublora could be None.
+        shard_size = [self.lora_a_stacked[i].shape[2] for i in range(3)]
+        start_idx = [self.tp_rank * shard_size[i] for i in range(3)]
+        lora_a = [
+            lora_a[0][start_idx[0] : start_idx[0] + shard_size[0], :]
+            if lora_a[0] is not None
+            else None,
+            lora_a[1][start_idx[1] : start_idx[1] + shard_size[1], :]
+            if lora_a[1] is not None
+            else None,
+            lora_a[2][start_idx[2] : start_idx[2] + shard_size[2], :]
+            if lora_a[2] is not None
+            else None,
+        ]
+        return lora_a
 
     def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
         return _mcp_apply(x, bias, self)
```
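The restored `__init__` above hard-codes three slices (Q, K, V). To make the shard arithmetic concrete, here is a self-contained sketch with illustrative numbers (8 query heads, 2 KV heads, head_size 64, tp_size 4); the variable names mirror the fields in the diff, but this is standalone arithmetic, not vLLM code.

```python
total_num_heads, total_num_kv_heads, head_size, tp_size = 8, 2, 64, 4

for tp_rank in range(tp_size):
    num_heads = total_num_heads // tp_size            # query heads per rank
    num_kv_heads = max(1, total_num_kv_heads // tp_size)
    num_kv_head_replicas = max(1, tp_size // total_num_kv_heads)

    q_proj_shard_size = num_heads * head_size
    kv_proj_shard_size = num_kv_heads * head_size
    q_shard_id = tp_rank
    kv_shard_id = tp_rank // num_kv_head_replicas     # ranks sharing a KV head

    output_slices = (q_proj_shard_size, kv_proj_shard_size, kv_proj_shard_size)
    output_ids = (q_shard_id, kv_shard_id, kv_shard_id)
    print(tp_rank, output_slices, output_ids)
# rank 0 -> (128, 64, 64) (0, 0, 0); rank 2 -> (128, 64, 64) (2, 1, 1); ...
```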

vllm/model_executor/models/whisper.py

Lines changed: 10 additions & 11 deletions
```diff
@@ -27,6 +27,7 @@
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
+    MergedColumnParallelLinear,
     QKVParallelLinear,
     RowParallelLinear,
 )
@@ -323,11 +324,12 @@ def _init_qkv(
             quant_config=quant_config,
             prefix=f"{prefix}.q_proj",
         )
-        self.kv_proj = QKVParallelLinear(
-            hidden_size=embed_dim,
-            head_size=self.head_dim,
-            total_num_heads=0,
-            total_num_kv_heads=self.total_num_heads,
+        # Use MergedColumnParallelLinear for K and V projections.
+        # This enables LoRA support via MergedColumnParallelLinearWithLoRA
+        # which handles 2-slice configurations.
+        self.kv_proj = MergedColumnParallelLinear(
+            input_size=embed_dim,
+            output_sizes=[embed_dim, embed_dim],
             bias=bias,
             quant_config=quant_config,
             prefix=f"{prefix}.kv_proj",
@@ -631,8 +633,9 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             (".self_attn.qkv_proj", ".self_attn.q_proj", "q"),
             (".self_attn.qkv_proj", ".self_attn.k_proj", "k"),
             (".self_attn.qkv_proj", ".self_attn.v_proj", "v"),
-            (".encoder_attn.kv_proj", ".encoder_attn.k_proj", "k"),
-            (".encoder_attn.kv_proj", ".encoder_attn.v_proj", "v"),
+            # MergedColumnParallelLinear uses integer indices (0, 1)
+            (".encoder_attn.kv_proj", ".encoder_attn.k_proj", 0),
+            (".encoder_attn.kv_proj", ".encoder_attn.v_proj", 1),
         ]
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
@@ -781,10 +784,6 @@ class WhisperForConditionalGeneration(
     nn.Module, SupportsTranscription, SupportsMultiModal, SupportsLoRA
 ):
     # LoRA-specific attributes
-    embedding_modules = {}
-    embedding_padding_modules: list[str] = []
-
-    merge_by_field_config = True
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "kv_proj": ["k_proj", "v_proj"],
```
