-
Notifications
You must be signed in to change notification settings - Fork 742
[PD] Refine metrics and trace for pd #7613
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -955,6 +955,9 @@ def _fetch_request(): | |
| self.llm_logger.debug( | ||
| f"P has allocated resources and then ask D resource for request: {task.request_id}" | ||
| ) | ||
| trace_print( | ||
| LoggingEventName.ASK_DECODE_RESOURCE_START, task.request_id, getattr(task, "user", "") | ||
| ) | ||
| task.metrics.ask_decode_resource_start_time = time.time() | ||
| while True: | ||
| self.split_connector.send_splitwise_tasks([task], task.idx) | ||
|
|
@@ -966,6 +969,11 @@ def _fetch_request(): | |
| time.sleep(0.05) | ||
| else: | ||
| task.metrics.ask_decode_resource_finish_time = time.time() | ||
| trace_print( | ||
| LoggingEventName.ASK_DECODE_RESOURCE_END, | ||
| task.request_id, | ||
| getattr(task, "user", ""), | ||
| ) | ||
| break | ||
| self.llm_logger.debug(f"D has allocated resource for request: {task.request_id}") | ||
| else: | ||
|
|
@@ -977,13 +985,19 @@ def _fetch_request(): | |
| self.llm_logger.debug( | ||
| f"P has allocated resources and then ask D resource for req_id: {task.request_id}" | ||
| ) | ||
| trace_print( | ||
| LoggingEventName.ASK_DECODE_RESOURCE_START, task.request_id, getattr(task, "user", "") | ||
| ) | ||
| task.metrics.ask_decode_resource_start_time = time.time() | ||
| self.split_connector.send_splitwise_tasks([task], task.idx) | ||
|
|
||
| for task in tasks: | ||
| # assure fetch block ids from D | ||
| status, msg = self.split_connector.check_decode_allocated(task) | ||
| task.metrics.ask_decode_resource_finish_time = time.time() | ||
| trace_print( | ||
| LoggingEventName.ASK_DECODE_RESOURCE_END, task.request_id, getattr(task, "user", "") | ||
| ) | ||
| if not status: | ||
| error_msg = ( | ||
| f"PD Error: prefill failed to apply for resource from decode, " | ||
|
|
@@ -1000,6 +1014,7 @@ def _fetch_request(): | |
| ) | ||
| ] | ||
| ) | ||
| main_process_metrics.reschedule_req_num.inc() | ||
| need_delete_tasks.append(task) | ||
| continue | ||
| for tmp_task in need_delete_tasks: | ||
|
|
@@ -1110,6 +1125,7 @@ def _fetch_request(): | |
| f"preallocated request. req:{task.request_id} " | ||
| ) | ||
| self.llm_logger.error(msg) | ||
| main_process_metrics.reschedule_req_num.inc() | ||
| self.scheduler.put_results( | ||
| [ | ||
| RequestOutput( | ||
|
|
@@ -2109,6 +2125,7 @@ def _process_allocate_resource_requests(): | |
| processed_indices = [] | ||
| for idx, task in enumerate(allocate_resource_requests): | ||
| is_success = False | ||
| trace_print(LoggingEventName.DECODE_PROCESS_PREALLOCATE_REQUEST_START, task.request_id, task.user) | ||
|
|
||
| if envs.ENABLE_V1_KVCACHE_SCHEDULER: | ||
| if self.resource_manager.preallocate_resource_in_d(task): | ||
|
|
@@ -2118,6 +2135,7 @@ def _process_allocate_resource_requests(): | |
| self.llm_logger.debug(f"D has successfully sent cache infos for task {task.request_id}") | ||
| processed_indices.append(idx) | ||
| is_success = True | ||
| main_process_metrics.decode_preallocated_req_num.inc() | ||
| else: | ||
| if self.resource_manager.is_resource_sufficient(task.prompt_token_ids_len): | ||
| self.llm_logger.debug(f"D Resource available, processing task {task.request_id}") | ||
|
|
@@ -2137,6 +2155,11 @@ def _process_allocate_resource_requests(): | |
| break | ||
|
|
||
| for idx in sorted(processed_indices, reverse=True): | ||
This comment was marked as outdated.
Sorry, something went wrong. |
||
| trace_print( | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🟡 建议 此处引用了拼写错误的 `DECODE_PROCESS_PREALLOCAT_REQUEST_END`(`PREALLOCAT` 应为 `PREALLOCATE`)。 |
||
| LoggingEventName.DECODE_PROCESS_PREALLOCAT_REQUEST_END, | ||
| allocate_resource_requests[idx].request_id, | ||
| allocate_resource_requests[idx].user, | ||
| ) | ||
| allocate_resource_requests.pop(idx) | ||
|
|
||
| def _process_prefilled_requests(): | ||
|
|
@@ -2152,6 +2175,7 @@ def _process_prefilled_requests(): | |
| continue | ||
| req_output.finished = False | ||
| ready_request_outputs.append(req_output) | ||
| trace_print(LoggingEventName.DECODE_PROCESS_PREFILLED_REQUEST_START, req_output.request_id, "") | ||
| self.llm_logger.debug(f"there are enough resource for prefilled request: {req_output.request_id}") | ||
|
|
||
| prefilled_request_ouputs = waiting_request_outputs | ||
|
|
@@ -2164,6 +2188,8 @@ def _process_prefilled_requests(): | |
| else: | ||
| for req_output in ready_request_outputs: | ||
| request_id = req_output.request_id | ||
| main_process_metrics.decode_preallocated_req_num.dec() | ||
|
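There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🔴 Bug
当 `ENABLE_V1_KVCACHE_SCHEDULER` 未开启时,`decode_preallocated_req_num.inc()` 不会被执行,而此处的 `decode_preallocated_req_num.dec()` 仍会执行,计数会出现偏差。 建议:同样在 V1 分支条件内执行 `main_process_metrics.decode_preallocated_req_num.dec()`。 |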
||
| trace_print(LoggingEventName.DECODE_PROCESS_PREFILLED_REQUEST_END, request_id, "") | ||
| if envs.FD_ENABLE_INTERNAL_ADAPTER and not req_output.outputs.token_ids: | ||
| # first token is eos in Prefill, just recycle resource and continue | ||
| self.llm_logger.warning(f"{request_id} need not decode after first token") | ||
|
|
@@ -2177,6 +2203,7 @@ def _process_prefilled_requests(): | |
| self.llm_logger.warning( | ||
| f"{request_id} prefill failed with msg:{req_output.error_msg}, recycle resource." | ||
| ) | ||
| main_process_metrics.failed_recv_first_token_req_num.inc() | ||
| self.resource_manager.pre_recycle_resource(request_id) | ||
| if request_id in self.token_processor.tokens_counter: | ||
| del self.token_processor.tokens_counter[request_id] | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -286,8 +286,7 @@ def _process_per_token(self, task, batch_id: int, token_ids: np.ndarray, result: | |
| llm_logger.info(self.resource_manager.info()) | ||
| if self.cfg.speculative_config.method: | ||
| self._compute_speculative_status() | ||
| if not is_prefill: | ||
| self._record_completion_metrics(task, current_time) | ||
| self._record_completion_metrics(task, current_time) | ||
This comment was marked as outdated.
Sorry, something went wrong. |
||
| self._recycle_resources(task_id, batch_id, task, result, is_prefill) | ||
| break | ||
| return result | ||
|
|
@@ -565,6 +564,8 @@ def _recycle_resources(self, task_id, index, task, result=None, is_prefill=False | |
| if is_prefill: | ||
| start_time = time.time() | ||
| result.metrics.wait_for_sending_cache_time = time.time() | ||
| trace_print(LoggingEventName.CHECK_CACHE_TRANSFER_START, task_id, getattr(task, "user", "")) | ||
|
|
||
| while True: | ||
| finished_task_ids = self.engine_worker_queue.get_finished_req() | ||
| if len(finished_task_ids) > 0: | ||
|
|
@@ -588,6 +589,7 @@ def _recycle_resources(self, task_id, index, task, result=None, is_prefill=False | |
| request_id=task_id, | ||
| cost_seconds=f"{time.time()-start_time:.5f}", | ||
| ) | ||
| trace_print(LoggingEventName.CHECK_CACHE_TRANSFER_END, task_id, getattr(task, "user", "")) | ||
| result.metrics.send_request_output_to_decode_time = time.time() | ||
| self.split_connector.send_first_token(task.disaggregate_info, [result]) | ||
| if envs.ENABLE_V1_KVCACHE_SCHEDULER: | ||
|
|
@@ -1036,8 +1038,7 @@ def _process_batch_output(self): | |
| main_process_metrics.request_token_ratio.observe(token_ratio) | ||
| if self.cfg.speculative_config.method: | ||
| self._compute_speculative_status(result) | ||
| if not is_prefill: | ||
| self._record_completion_metrics(task, current_time) | ||
| self._record_completion_metrics(task, current_time) | ||
This comment was marked as outdated.
Sorry, something went wrong. |
||
| log_request( | ||
| RequestLogLevel.STAGES, | ||
| message="task {request_id} received eos token. Recycling.", | ||
|
|
@@ -1089,13 +1090,21 @@ def _record_first_token_metrics(self, task, current_time): | |
|
|
||
| def _record_completion_metrics(self, task, current_time): | ||
| """Record metrics when request completes""" | ||
| role = self.cfg.scheduler_config.splitwise_role | ||
| metrics = task.metrics | ||
| if metrics.engine_recv_first_token_time: | ||
| decode_time = current_time - metrics.engine_recv_first_token_time | ||
| main_process_metrics.request_decode_time.observe(decode_time) | ||
| trace_print(LoggingEventName.INFERENCE_END, task.request_id, getattr(task, "user", "")) | ||
|
|
||
| if role in ("mixed", "decode"): | ||
|
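There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. ❓ 疑问 去掉 `if not is_prefill` 判断后,`_record_completion_metrics` 在 Prefill 角色下也会被调用,其中包含 `num_requests_running.dec(1)` 和 `request_success_total.inc()`。 如果 Prefill 完成不等于请求完成,此处会导致 `request_success_total` 被提前或重复计数。 |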
||
| if metrics.engine_recv_first_token_time: | ||
| decode_time = current_time - metrics.engine_recv_first_token_time | ||
| main_process_metrics.request_decode_time.observe(decode_time) | ||
| trace_print(LoggingEventName.INFERENCE_END, task.request_id, getattr(task, "user", "")) | ||
|
|
||
| if role == "prefill": | ||
| trace_print(LoggingEventName.PREFILL_INFERENCE_END, task.request_id, getattr(task, "user", "")) | ||
| elif role == "decode": | ||
| trace_print(LoggingEventName.DECODE_INFERENCE_END, task.request_id, getattr(task, "user", "")) | ||
|
|
||
| trace_print(LoggingEventName.POSTPROCESSING_START, task.request_id, getattr(task, "user", "")) | ||
| main_process_metrics.num_requests_running.dec(1) | ||
| main_process_metrics.request_success_total.inc() | ||
|
juncaipeng marked this conversation as resolved.
|
||
| main_process_metrics.request_inference_time.observe(current_time - metrics.inference_start_time) | ||
| main_process_metrics.request_generation_tokens.observe(self.tokens_counter[task.request_id]) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -41,6 +41,20 @@ class LoggingEventName(Enum): | |
| PREEMPTED = "PREEMPTED" | ||
| RESCHEDULED_INFERENCE_START = "RESCHEDULED_INFERENCE_START" | ||
|
|
||
| # For Prefill Instance | ||
| ASK_DECODE_RESOURCE_START = "ASK_DECODE_RESOURCE_START" | ||
| ASK_DECODE_RESOURCE_END = "ASK_DECODE_RESOURCE_END" | ||
| CHECK_CACHE_TRANSFER_START = "CHECK_CACHE_TRANSFER_START" | ||
| CHECK_CACHE_TRANSFER_END = "CHECK_CACHE_TRANSFER_END" | ||
| PREFILL_INFERENCE_END = "PREFILL_INFERENCE_END" | ||
|
|
||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🔴 Bug 枚举命名拼写错误: `DECODE_PROCESS_PREALLOCAT_REQUEST_END` 与对应的 `DECODE_PROCESS_PREALLOCATE_REQUEST_START` 不一致(`PREALLOCAT` 缺少字母 `E`)。 建议修复: DECODE_PROCESS_PREALLOCATE_REQUEST_END = "DECODE_PROCESS_PREALLOCATE_REQUEST_END" 同时需同步修改引用处: trace_print(
LoggingEventName.DECODE_PROCESS_PREALLOCATE_REQUEST_END, # 修正拼写
...
) |
||
| # For Decode Instance | ||
| DECODE_PROCESS_PREALLOCATE_REQUEST_START = "DECODE_PROCESS_PREALLOCATE_REQUEST_START" | ||
| DECODE_PROCESS_PREALLOCAT_REQUEST_END = "DECODE_PROCESS_PREALLOCAT_REQUEST_END" | ||
| DECODE_PROCESS_PREFILLED_REQUEST_START = "DECODE_PROCESS_PREFILLED_REQUEST_START" | ||
| DECODE_PROCESS_PREFILLED_REQUEST_END = "DECODE_PROCESS_PREFILLED_REQUEST_END" | ||
| DECODE_INFERENCE_END = "DECODE_INFERENCE_END" | ||
|
|
||
|
|
||
| class StageName(Enum): | ||
| """ | ||
|
|
@@ -75,4 +89,14 @@ class StageName(Enum): | |
| LoggingEventName.WRITE_CACHE_TO_STORAGE_END: StageName.POSTPROCESSING, | ||
| LoggingEventName.POSTPROCESSING_START: StageName.POSTPROCESSING, | ||
| LoggingEventName.POSTPROCESSING_END: StageName.POSTPROCESSING, | ||
| LoggingEventName.ASK_DECODE_RESOURCE_START: StageName.SCHEDULE, | ||
| LoggingEventName.ASK_DECODE_RESOURCE_END: StageName.SCHEDULE, | ||
| LoggingEventName.CHECK_CACHE_TRANSFER_START: StageName.POSTPROCESSING, | ||
| LoggingEventName.CHECK_CACHE_TRANSFER_END: StageName.POSTPROCESSING, | ||
| LoggingEventName.PREFILL_INFERENCE_END: StageName.PREFILL, | ||
| LoggingEventName.DECODE_PROCESS_PREALLOCATE_REQUEST_START: StageName.DECODE, | ||
| LoggingEventName.DECODE_PROCESS_PREALLOCAT_REQUEST_END: StageName.DECODE, | ||
| LoggingEventName.DECODE_PROCESS_PREFILLED_REQUEST_START: StageName.DECODE, | ||
| LoggingEventName.DECODE_PROCESS_PREFILLED_REQUEST_END: StageName.DECODE, | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🔴 Bug 映射表中引用了拼写错误的枚举名: `DECODE_PROCESS_PREALLOCAT_REQUEST_END`(应为 `DECODE_PROCESS_PREALLOCATE_REQUEST_END`)。 |
||
| LoggingEventName.DECODE_INFERENCE_END: StageName.DECODE, | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.