|
10 | 10 | from vllm.core.interfaces import AllocStatus |
11 | 11 | from vllm.core.scheduler import Scheduler, SchedulingBudget |
12 | 12 | from vllm.lora.request import LoRARequest |
13 | | -from vllm.sequence import SequenceGroup, SequenceStatus |
| 13 | +from vllm.sequence import SequenceGroup |
14 | 14 |
|
15 | 15 | from .utils import (append_new_token, append_new_token_seq_group, |
16 | 16 | create_dummy_prompt, get_sequence_groups, |
@@ -296,55 +296,6 @@ def test_scheduler_delay_factor(): |
296 | 296 | append_new_token(out, 1) |
297 | 297 |
|
298 | 298 |
|
299 | | -def test_swapped_out_prioritized(): |
300 | | - block_size = 4 |
301 | | - scheduler = initialize_scheduler(max_num_seqs=6, |
302 | | - block_size=block_size, |
303 | | - num_cpu_blocks=64, |
304 | | - num_gpu_blocks=64) |
305 | | - # best_of=2 * 3 == 6 sequences. |
306 | | - for i in range(3): |
307 | | - _, seq_group = create_dummy_prompt(str(i), |
308 | | - prompt_length=60, |
309 | | - best_of=2, |
310 | | - block_size=block_size) |
311 | | - scheduler.add_seq_group(seq_group) |
312 | | - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) |
313 | | - # prefill scheduled now. |
314 | | - assert len(out.scheduled_seq_groups) == 3 |
315 | | - append_new_token(out, 1) |
316 | | - |
317 | | - # The last request should be swapped out. |
318 | | - scheduler.block_manager.can_append_slots = MagicMock() |
319 | | - |
320 | | - def cannot_append_second_group(seq_group, num_lookahead_slots): |
321 | | - return seq_group.request_id != "2" |
322 | | - |
323 | | - scheduler.block_manager.can_append_slots.side_effect = ( |
324 | | - cannot_append_second_group) |
325 | | - |
326 | | - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) |
327 | | - assert len(out.scheduled_seq_groups) == 2 |
328 | | - assert out.num_batched_tokens == 2 |
329 | | - assert out.blocks_to_swap_out != [] |
330 | | - assert out.blocks_to_swap_in == [] |
331 | | - append_new_token(out, 1) |
332 | | - |
333 | | - # Add 1 more task. Swap should be prioritized over prefill. |
334 | | - _, seq_group = create_dummy_prompt(str(i), |
335 | | - prompt_length=60, |
336 | | - best_of=2, |
337 | | - block_size=block_size) |
338 | | - scheduler.add_seq_group(seq_group) |
339 | | - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) |
340 | | - append_new_token(out, 1) |
341 | | - assert len(out.scheduled_seq_groups) == 3 |
342 | | - # 3 decodes. It is swapped in. |
343 | | - assert out.num_batched_tokens == 3 |
344 | | - assert out.blocks_to_swap_in != [] |
345 | | - assert out.blocks_to_swap_out == [] |
346 | | - |
347 | | - |
348 | 299 | def initialize_scheduler( |
349 | 300 | *, |
350 | 301 | max_num_seqs=1000, |
@@ -646,60 +597,6 @@ def cannot_append_second_group(seq_group, num_lookahead_slots): |
646 | 597 | assert output.blocks_to_copy == [] |
647 | 598 |
|
648 | 599 |
|
649 | | -def test_decode_swap_beam_search(): |
650 | | - """ |
651 | | - Test best_of > 1 swap out blocks |
652 | | - """ |
653 | | - block_size = 4 |
654 | | - scheduler = initialize_scheduler(block_size=block_size, |
655 | | - num_gpu_blocks=64, |
656 | | - num_cpu_blocks=64) |
657 | | - curr_loras = None |
658 | | - budget = create_token_budget() |
659 | | - for i in range(3): |
660 | | - _, seq_group = create_dummy_prompt(str(i), |
661 | | - prompt_length=60, |
662 | | - best_of=2, |
663 | | - block_size=block_size) |
664 | | - scheduler._allocate_and_set_running(seq_group) |
665 | | - scheduler._add_seq_group_to_running(seq_group) |
666 | | - append_new_token_seq_group(60, seq_group, 1) |
667 | | - budget.add_num_seqs(seq_group.request_id, |
668 | | - seq_group.get_max_num_running_seqs()) |
669 | | - budget.add_num_batched_tokens( |
670 | | - seq_group.request_id, seq_group.num_seqs(SequenceStatus.RUNNING)) |
671 | | - |
672 | | - # The last request should be swapped out. |
673 | | - scheduler.block_manager.can_append_slots = MagicMock() |
674 | | - |
675 | | - def cannot_append_second_group(seq_group, num_lookahead_slots): |
676 | | - return seq_group.request_id != "2" |
677 | | - |
678 | | - scheduler.block_manager.can_append_slots.side_effect = ( |
679 | | - cannot_append_second_group) |
680 | | - scheduler.block_manager.swap_out = MagicMock() |
681 | | - expected_swap_mapping = [("5", "7")] |
682 | | - scheduler.block_manager.swap_out.return_value = expected_swap_mapping |
683 | | - |
684 | | - output = scheduler._schedule_running(budget, curr_loras) |
685 | | - remainig_running = scheduler.running |
686 | | - assert len(remainig_running) == 0 |
687 | | - assert len(output.decode_seq_groups) == 2 |
688 | | - assert len(output.prefill_seq_groups) == 0 |
689 | | - assert output.decode_seq_groups[0].seq_group.request_id == "0" |
690 | | - assert output.decode_seq_groups[1].seq_group.request_id == "1" |
691 | | - assert len(output.preempted) == 0 |
692 | | - assert len(output.swapped_out) == 1 |
693 | | - # Budget should reflect preempted requests. |
694 | | - assert budget.num_batched_tokens == 2 |
695 | | - # since there are 2 sequences, 2 should be subtracted. |
696 | | - assert budget.num_curr_seqs == 4 |
697 | | - # The group should be swapped out, not preempted. |
698 | | - assert output.blocks_to_swap_out == expected_swap_mapping |
699 | | - # Nothing is copied. |
700 | | - assert output.blocks_to_copy == [] |
701 | | - |
702 | | - |
703 | 600 | def test_schedule_decode_blocks_to_copy_update(): |
704 | 601 | """ |
705 | 602 | Verify blocks_to_copy is updated. |
@@ -736,105 +633,6 @@ def test_schedule_decode_blocks_to_copy_update(): |
736 | 633 | assert output.blocks_to_copy == [(2, 3)] |
737 | 634 |
|
738 | 635 |
|
739 | | -def test_schedule_swapped_simple(): |
740 | | - block_size = 4 |
741 | | - scheduler = initialize_scheduler(block_size=block_size) |
742 | | - curr_loras = None |
743 | | - blocks_to_swap_out: List[Tuple[int, int]] = [] |
744 | | - _, seq_group = create_dummy_prompt("1", |
745 | | - prompt_length=4, |
746 | | - best_of=2, |
747 | | - block_size=block_size) |
748 | | - scheduler._allocate_and_set_running(seq_group) |
749 | | - append_new_token_seq_group(4, seq_group, 1) |
750 | | - scheduler._swap_out(seq_group, blocks_to_swap_out) |
751 | | - scheduler._add_seq_group_to_swapped(seq_group) |
752 | | - |
753 | | - budget = create_token_budget() |
754 | | - output = scheduler._schedule_swapped(budget, curr_loras) |
755 | | - remaining_swapped = scheduler.swapped |
756 | | - assert len(remaining_swapped) == 0 |
757 | | - assert budget.num_batched_tokens == 1 |
758 | | - assert budget.num_curr_seqs == 2 |
759 | | - assert len(output.decode_seq_groups) == 1 |
760 | | - assert len(output.prefill_seq_groups) == 0 |
761 | | - # swap in is the reverse of swap out |
762 | | - blocks_to_swap_in_reverse = [] |
763 | | - for swapin, swapout in output.blocks_to_swap_in: |
764 | | - blocks_to_swap_in_reverse.append((swapout, swapin)) |
765 | | - assert blocks_to_swap_out == blocks_to_swap_in_reverse |
766 | | - |
767 | | - |
768 | | -def test_schedule_swapped_max_token_budget(): |
769 | | - block_size = 4 |
770 | | - scheduler = initialize_scheduler(block_size=block_size, |
771 | | - num_cpu_blocks=32, |
772 | | - num_gpu_blocks=32) |
773 | | - curr_loras = None |
774 | | - blocks_to_swap_out: List[Tuple[int, int]] = [] |
775 | | - for i in range(2): |
776 | | - _, seq_group = create_dummy_prompt(str(i), prompt_length=60, best_of=2) |
777 | | - scheduler._allocate_and_set_running(seq_group) |
778 | | - append_new_token_seq_group(60, seq_group, 1) |
779 | | - scheduler._swap_out(seq_group, blocks_to_swap_out) |
780 | | - scheduler._add_seq_group_to_swapped(seq_group) |
781 | | - |
782 | | - budget = create_token_budget(token_budget=1) |
783 | | - output = scheduler._schedule_swapped(budget, curr_loras) |
784 | | - remaining_swapped = scheduler.swapped |
785 | | - assert len(remaining_swapped) == 1 |
786 | | - assert budget.num_batched_tokens == 1 |
787 | | - assert budget.num_curr_seqs == 2 |
788 | | - assert len(output.decode_seq_groups) == 1 |
789 | | - assert len(output.prefill_seq_groups) == 0 |
790 | | - |
791 | | - # Verify num_batched_tokens are respected. |
792 | | - budget = create_token_budget(token_budget=1) |
793 | | - add_token_budget(budget, 1, 0) |
794 | | - output = scheduler._schedule_swapped(budget, curr_loras) |
795 | | - remaining_swapped = scheduler.swapped |
796 | | - assert len(remaining_swapped) == 1 |
797 | | - assert budget.num_batched_tokens == 1 |
798 | | - assert budget.num_curr_seqs == 0 |
799 | | - assert len(output.decode_seq_groups) == 0 |
800 | | - assert len(output.prefill_seq_groups) == 0 |
801 | | - |
802 | | - |
803 | | -def test_schedule_swapped_max_seqs(): |
804 | | - block_size = 4 |
805 | | - scheduler = initialize_scheduler(block_size=block_size, |
806 | | - num_cpu_blocks=64, |
807 | | - num_gpu_blocks=64) |
808 | | - curr_loras = None |
809 | | - blocks_to_swap_out: List[Tuple[int, int]] = [] |
810 | | - for i in range(4): |
811 | | - _, seq_group = create_dummy_prompt(str(i), |
812 | | - prompt_length=60, |
813 | | - block_size=4) |
814 | | - scheduler._allocate_and_set_running(seq_group) |
815 | | - append_new_token_seq_group(60, seq_group, 1) |
816 | | - scheduler._swap_out(seq_group, blocks_to_swap_out) |
817 | | - scheduler._add_seq_group_to_swapped(seq_group) |
818 | | - |
819 | | - budget = create_token_budget(max_num_seqs=2) |
820 | | - output = scheduler._schedule_swapped(budget, curr_loras) |
821 | | - remaining_swapped = scheduler.swapped |
822 | | - assert len(remaining_swapped) == 2 |
823 | | - assert budget.num_batched_tokens == 2 |
824 | | - assert budget.num_curr_seqs == 2 |
825 | | - assert len(output.decode_seq_groups) == 2 |
826 | | - assert len(output.prefill_seq_groups) == 0 |
827 | | - |
828 | | - # Verify num_curr_seqs are respected. |
829 | | - output = scheduler._schedule_swapped(budget, curr_loras) |
830 | | - remaining_swapped = scheduler.swapped |
831 | | - assert len(remaining_swapped) == 2 |
832 | | - assert budget.num_batched_tokens == 2 |
833 | | - assert budget.num_curr_seqs == 2 |
834 | | - assert len(output.decode_seq_groups) == 0 |
835 | | - assert len(output.prefill_seq_groups) == 0 |
836 | | - |
837 | | - |
838 | 636 | def test_schedule_swapped_max_loras(): |
839 | 637 | block_size = 4 |
840 | 638 | lora_config = LoRAConfig(max_lora_rank=8, max_loras=1) |
|
0 commit comments