Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,10 @@ pub struct CollationManager {
leaf_scheduling_info: HashMap<Hash, LeafSchedulingInfo>,
// Clock for time reads (V3 scheduling-parent slot validation, advertisement timestamps).
clock: Arc<dyn Clock>,
// Rate-limiting state for the (potentially frequent) collation-fetch error warnings, so a
// flaky network or a buggy `Canceled` loop can't flood the logs.
network_error_freq: gum::Freq,
canceled_freq: gum::Freq,
}

impl CollationManager {
Expand All @@ -133,6 +137,8 @@ impl CollationManager {
keystore,
leaf_scheduling_info: HashMap::default(),
clock,
network_error_freq: gum::Freq::new(),
canceled_freq: gum::Freq::new(),
};

instance.update_view(sender, OurView::new([active_leaf.hash], 0)).await?;
Expand Down Expand Up @@ -205,6 +211,13 @@ impl CollationManager {
if !self.implicit_view.paths_via_relay_parent(&sp).is_empty() {
return Some((sp, per_sp));
}

gum::trace!(
target: LOG_TARGET,
scheduling_parent = ?sp,
"Scheduling parent no longer reachable from any leaf; dropping it and cancelling its in-flight fetches",
);

for (advertisement, _) in per_sp.all_advertisements() {
self.fetching.cancel(advertisement);
}
Expand Down Expand Up @@ -252,6 +265,13 @@ impl CollationManager {
continue;
},
};
gum::trace!(
target: LOG_TARGET,
scheduling_parent = ?ancestor,
?core,
session_index,
"Registered scheduling parent on our assigned core",
);
self.per_scheduling_parent
.insert(*ancestor, PerSchedulingParent::new(session_index, core, &*self.clock));
}
Expand Down Expand Up @@ -606,7 +626,11 @@ impl CollationManager {
return CanSecond::No(None, reject_info);
};

match process_collation_fetch_result(res) {
match process_collation_fetch_result(
res,
&mut self.network_error_freq,
&mut self.canceled_freq,
) {
Ok(fetched_collation) => {
let candidate_hash = fetched_collation.candidate_receipt.hash();
// It can't be a duplicate, because we check before initiating fetch. For the old
Expand Down Expand Up @@ -850,7 +874,14 @@ impl CollationManager {
.collect();

// `Ord` is custom: descending by score, so first = best.
let Some(best) = advertisements.first() else { return Either::Left(None) };
let Some(best) = advertisements.first() else {
gum::trace!(
target: LOG_TARGET,
?para_id,
"No fetchable advertisement for a free claim-queue slot",
);
return Either::Left(None);
};

let delay = Self::calculate_delay(best.score, highest_rep_of_para);

Expand All @@ -872,6 +903,14 @@ impl CollationManager {
);
Either::Left(Some(*best.adv))
} else {
gum::trace!(
target: LOG_TARGET,
peer_id = ?best.adv.peer_id,
scheduling_parent = ?best.adv.scheduling_parent,
?para_id,
?remaining,
"Best advertisement is fetch-delayed; will fetch once the delay elapses",
);
Either::Right(remaining)
}
}
Expand Down Expand Up @@ -1361,6 +1400,8 @@ async fn fetch_pvd<Sender: CollatorProtocolSenderTrait>(

fn process_collation_fetch_result(
(advertisement, res): CollationFetchResponse,
network_error_freq: &mut gum::Freq,
canceled_freq: &mut gum::Freq,
) -> std::result::Result<FetchedCollation, Option<Score>> {
match res {
Err(CollationFetchError::Cancelled) => {
Expand All @@ -1385,7 +1426,9 @@ fn process_collation_fetch_result(
Err(Some(FAILED_FETCH_SLASH))
},
Err(CollationFetchError::Request(RequestError::NetworkError(err))) => {
gum::warn!(
gum::warn_if_frequent!(
freq: network_error_freq,
max_rate: gum::Times::PerHour(100),
target: LOG_TARGET,
?advertisement,
err = ?err,
Expand All @@ -1394,7 +1437,9 @@ fn process_collation_fetch_result(
Err(None)
},
Err(CollationFetchError::Request(RequestError::Canceled(err))) => {
gum::warn!(
gum::warn_if_frequent!(
freq: canceled_freq,
max_rate: gum::Times::PerHour(100),
target: LOG_TARGET,
?advertisement,
err = ?err,
Expand Down Expand Up @@ -1691,6 +1736,8 @@ mod tests {
keystore: Arc::new(sc_keystore::LocalKeystore::in_memory()),
leaf_scheduling_info: HashMap::default(),
clock: polkadot_node_clock::system_clock(),
network_error_freq: gum::Freq::new(),
canceled_freq: gum::Freq::new(),
};

// No advertisements - returns Left(None).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,12 @@ async fn initialize<Context>(

let scheduled_paras = collation_manager.assignments();

gum::info!(

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this important to be logged at info level ?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, let's downgrade this to debug too.
My initial idea was to have information about assignments on INFO level, but it will be too noisy.

target: LOG_TARGET,
?scheduled_paras,
"Collator protocol initial assignments",
);

// Create PersistentDb with disk persistence
let (backend, task) = match PersistentDb::new(
db.clone(),
Expand Down Expand Up @@ -183,7 +189,7 @@ async fn wait_for_first_leaf<Context>(ctx: &mut Context) -> FatalResult<Option<A
CollatorProtocolMessage::NetworkBridgeUpdate(
NetworkBridgeEvent::PeerConnected(peer_id, ..),
) => {
gum::info!(
gum::debug!(
target: LOG_TARGET,
?peer_id,
"Disconnecting peer that connected before subsystem initialization",
Expand All @@ -197,7 +203,7 @@ async fn wait_for_first_leaf<Context>(ctx: &mut Context) -> FatalResult<Option<A
CollatorProtocolMessage::NetworkBridgeUpdate(
NetworkBridgeEvent::PeerMessage(peer_id, ..),
) => {
gum::info!(
gum::debug!(
target: LOG_TARGET,
?peer_id,
"Disconnecting peer that sent message before subsystem initialization",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,24 @@ impl<B: Backend> PeerManager<B> {
.db
.process_bumps(finalized_block_number, bumps, Some(Score::new(INACTIVITY_DECAY)), now)
.await;

if !updates.is_empty() {
gum::debug!(
target: LOG_TARGET,
finalized_block_number,
?finalized_block_hash,
num_updates = updates.len(),
"Applying reputation updates on new finalized block",
);
}

for update in updates {
gum::trace!(
target: LOG_TARGET,
finalized_block_number,
?update,
"Applying reputation update",
);
self.connected.update_reputation(update);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,15 @@ impl<B: Backend> State<B> {
"Old assignments vs new assignments",
);

if old_assignments != new_assignments {
gum::info!(

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not debug ?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are right, this is not a good idea.
groupRotation is 10 on polkadot, so we'll have this log ~each minute, which is not worth it. I'll downgrade it to debug.

target: LOG_TARGET,
?old_assignments,
?new_assignments,
"Collator protocol assignments changed",
);
}

let maybe_disconnected_peers =
self.peer_manager.scheduled_paras_update(sender, new_assignments).await;

Expand Down Expand Up @@ -468,19 +477,48 @@ impl<B: Backend> State<B> {
)
.await;

if let Some((peer_id, PeerInfo { version, .. })) = peer_id
.and_then(|peer_id| self.peer_manager.peer_info(&peer_id).map(|info| (peer_id, info)))
{
gum::debug!(
target: LOG_TARGET,
?para_id,
?scheduling_parent,
?candidate_hash,
?peer_id,
"Notifying collator about seconded collation",
);
notify_collation_seconded(sender, peer_id, *version, scheduling_parent, statement)
.await;
match peer_id {
Some(peer_id) => match self.peer_manager.peer_info(&peer_id) {
Some(PeerInfo { version, .. }) => {
gum::debug!(
target: LOG_TARGET,
?para_id,
?scheduling_parent,
?candidate_hash,
?peer_id,
"Notifying collator about seconded collation",
);
notify_collation_seconded(
sender,
peer_id,
*version,
scheduling_parent,
statement,
)
.await;
},
// We know who fetched it, but they disconnected before we could ack the second.
None => {
gum::trace!(
target: LOG_TARGET,
?para_id,
?scheduling_parent,
?candidate_hash,
?peer_id,
"Not notifying collator about seconded collation: peer no longer connected",
);
},
},
// No tracked fetcher for this candidate (e.g. its slot was already released).
None => {
gum::trace!(
target: LOG_TARGET,
?para_id,
?scheduling_parent,
?candidate_hash,
"Not notifying any collator about seconded collation: fetcher unknown",
);
},
}

if !unblocked_collations.is_empty() {
Expand Down
19 changes: 19 additions & 0 deletions prdoc/pr_12282.prdoc
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
title: Logging improvements for the collator revamp
doc:
- audience: Node Dev
description: |-
Some logging updates:

- Decrease log levels in `wait_for_first_leaf` to DEBUG, to avoid startup spam
- Use `warn_if_frequent` for fetch errors
- Log assignment changes on view change
- pick_best_advertisement: trace logs for each outcome
- update_view: log scheduling parent <-> assigned core mapping
- update_view: log sp removal
- handle_seconded_collation: logs for each error case
- PeerManager: log reputation updates

Partially addresses https://github.com/paritytech/polkadot-sdk/issues/10402
crates:
- name: polkadot-collator-protocol
bump: patch
Loading