-
Notifications
You must be signed in to change notification settings - Fork 742
[KVCache] Support only flush FD GPU/CPU Cache index by AttentionStore #7609
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -828,7 +828,7 @@ def request_match_blocks(self, task: Request, block_size, *args): | |
| storage_match_token_num = 0 | ||
| match_storage_block_ids = [] | ||
|
|
||
| if self.kvcache_storage_backend and no_match_token_num >= block_size: | ||
| if self.kvcache_storage_backend and no_match_token_num >= block_size and not envs.FD_AS_ONLY_FLUSH: | ||
| if not self.can_allocate_gpu_blocks(num_blocks=no_match_block_num, try_free_gpu_blocks=False): | ||
| raise Exception( | ||
| "request_match_blocks: Not enough GPU memory to allocate cache for matched Storage Cache" | ||
|
|
@@ -1240,14 +1240,15 @@ def issue_write_back_storage_task(self, task: WriteStorageTask, is_sync=True): | |
| if self.kvcache_storage_backend is None: | ||
| return | ||
|
|
||
| if len(task.keys) != len(task.gpu_block_ids): | ||
| if not envs.FD_AS_ONLY_FLUSH and len(task.keys) != len(task.gpu_block_ids): | ||
| err_msg = ( | ||
| f"write_back_storage error: hash_keys({len(task.keys)}) != gpu_block_ids({len(task.gpu_block_ids)})" | ||
| ) | ||
| logger.error(err_msg) | ||
| raise ValueError(err_msg) | ||
|
|
||
| self.task_write_back_event[task.task_id] = Event() | ||
| if is_sync: | ||
| self.task_write_back_event[task.task_id] = Event() | ||
| self.cache_task_queue.put_transfer_task((CacheStatus.GPU2STORAGE, task)) | ||
| if is_sync: | ||
| self.wait_write_storage_task(task.task_id) | ||
|
|
@@ -1536,6 +1537,7 @@ def free_cpu_block_ids(self, need_block_num): | |
| - freed_block_num: Number of CPU blocks successfully evicted | ||
| """ | ||
| hash_value_block_ids_map = defaultdict(list) | ||
| hash_value_flush_info = {} # {input_hash_value: (token_ids, min_depth)} | ||
| total_cpu_free_count = 0 | ||
| with self.request_release_lock: | ||
| while True: | ||
|
|
@@ -1551,6 +1553,10 @@ def free_cpu_block_ids(self, need_block_num): | |
|
|
||
| self.recycle_cpu_blocks(node.block_id) | ||
| hash_value_block_ids_map[node.input_hash_value].extend(reversed(tmp_block_ids)) | ||
| if envs.FD_AS_ONLY_FLUSH and self.kvcache_storage_backend == "attention_store": | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

❓ 疑问：请确认
||
| key = node.input_hash_value | ||
| if key not in hash_value_flush_info or node.depth < hash_value_flush_info[key][1]: | ||
| hash_value_flush_info[key] = (node.input_ids, node.depth) | ||
| logger.info(f"free_cpu_block_ids: free node {node}") | ||
|
|
||
| self.node_id_pool.append(node.node_id) | ||
|
|
@@ -1575,6 +1581,17 @@ def free_cpu_block_ids(self, need_block_num): | |
| logger.info( | ||
| "free_cpu_block_ids: after free, " + f"len(self.cpu_free_block_list) {len(self.cpu_free_block_list)}" | ||
| ) | ||
| if envs.FD_AS_ONLY_FLUSH and self.kvcache_storage_backend == "attention_store" and hash_value_flush_info: | ||
| for input_hash_value, (token_ids, min_depth) in hash_value_flush_info.items(): | ||
| flush_task = WriteStorageTask( | ||
| task_id=str(uuid.uuid4()), | ||
| keys=[input_hash_value], | ||
| token_ids=token_ids, | ||
| gpu_block_ids=[], | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

🟡 建议：建议补充注释，说明此处的用法。另外，
||
| flush_cache_exists=False, | ||
| start_write_block_idx=min_depth - 1, | ||
| ) | ||
| self.issue_write_back_storage_task(flush_task, is_sync=False) | ||
| return total_cpu_free_count | ||
|
|
||
| def get_block_hash_extra_keys(self, request, start_idx, end_idx, mm_idx): | ||
|
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

🟡 建议

`_flush_only_storage_task` 中 `reside_in_gpu` 语义存疑：当 `task.flush_cache_exists is None` 时，代码回退为 `reside_in_gpu=True`，注释说明此场景是「request finish」，语义上 cache 仍在 GPU，看起来合理。但 `None` 的含义是「不适用」，若调用方并非 request-finish 场景而忘记赋值，这里会静默传入 `True`，产生错误的索引状态。建议在此处添加日志警告，明确 `None` 仅允许在请求完成（非 flush-only）路径下出现：