Skip to content

Commit cc623d7

Browse files
alcoclaude
andauthored
Add ETS table memory observability metrics (#3632)
## Summary - Adds `ElectricTelemetry.EtsTables` module that collects memory usage statistics from ETS tables, grouping by "type" (extracted from table names using pattern matching on UUID suffixes) - Integrates ETS table metrics into the telemetry pipeline via a new `ets_memory` periodic measurement in `ApplicationTelemetry` - Exports `ets.memory.total` metric with `table_type` tag, following the same pattern as `process.memory.total` - Adds configurable `top_ets_table_count` option (default: 10) ## Changes - **New**: `ElectricTelemetry.EtsTables` - functions for collecting top N ETS tables by memory (individually and grouped by type) - **Modified**: `ElectricTelemetry.ApplicationTelemetry` - added `ets_memory/1` periodic measurement and metric definition - **Modified**: `ElectricTelemetry.Opts` - added `top_ets_table_count` configuration option ## Test plan - [x] Unit tests for `EtsTables` module (13 tests passing) - [x] Tests cover type extraction from colon-separated and underscore-separated names - [x] Tests verify grouping, sorting, and statistics calculations - [x] Compilation with `--warnings-as-errors` passes 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 11b151b commit cc623d7

File tree

7 files changed

+601
-5
lines changed

7 files changed

+601
-5
lines changed
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@core/electric-telemetry": minor
3+
---
4+
5+
Add ETS table memory observability metrics, reporting top N table types by memory usage similar to process memory metrics

packages/electric-telemetry/lib/electric/telemetry/application_telemetry.ex

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ defmodule ElectricTelemetry.ApplicationTelemetry do
7676
:garbage_collection,
7777
:reductions,
7878
:process_memory,
79+
:ets_memory,
7980
:get_system_load_average,
8081
:get_system_memory_usage
8182
],
@@ -86,6 +87,7 @@ defmodule ElectricTelemetry.ApplicationTelemetry do
8687
def metrics(telemetry_opts) do
8788
[
8889
last_value("process.memory.total", tags: [:process_type], unit: :byte),
90+
last_value("ets.memory.total", tags: [:table_type], unit: :byte),
8991
last_value("system.cpu.core_count"),
9092
last_value("system.cpu.utilization.total"),
9193
last_value("system.load_percent.avg1"),
@@ -176,6 +178,13 @@ defmodule ElectricTelemetry.ApplicationTelemetry do
176178
end
177179
end
178180

181+
# Periodic measurement: emits one `[:ets, :memory]` telemetry event per ETS
# table type returned by `ElectricTelemetry.EtsTables.top_by_type/1`.
#
# The number of table types reported is read from the `:top_ets_table_count`
# key under `:intervals_and_thresholds` in the telemetry options. Each event
# carries the aggregated memory as the `:total` measurement and the
# stringified type as the `:table_type` metadata tag.
def ets_memory(%{intervals_and_thresholds: %{top_ets_table_count: limit}}) do
  ranked_types = ElectricTelemetry.EtsTables.top_by_type(limit)

  for %{type: table_type, memory: bytes} <- ranked_types do
    :telemetry.execute([:ets, :memory], %{total: bytes}, %{table_type: to_string(table_type)})
  end
end
187+
179188
def cpu_utilization(_) do
180189
case :cpu_sup.util([:per_cpu]) do
181190
{:error, reason} ->
Lines changed: 287 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,287 @@
1+
defmodule ElectricTelemetry.EtsTables do
  @moduledoc """
  Functions for collecting memory usage statistics from ETS tables.

  This module provides functions to:
  - Get the top N individual ETS tables by memory usage
  - Get the top M "types" of ETS tables by aggregated memory usage

  ## Table types

  A table's "type" is derived from its name so that per-stack/per-instance
  tables collapse into a single group:

  - `prefix:<uuid>` is grouped under `prefix` (everything before the first
    colon-separated segment that looks like a UUID)
  - `prefix_<uuid>` is grouped under `prefix` (same rule with `_` as the
    separator; also used as a fallback when the name contains a colon but no
    colon-separated segment looks like a UUID)
  - any other name is used as the type verbatim, so tables that share the same
    name (ETS allows this for tables created without `:named_table`) end up in
    the same group

  A segment "looks like a UUID" when it starts with 8 hexadecimal digits
  (case-insensitive).

  ## Units

  `:memory` values are in bytes (the word count reported by ETS multiplied by
  the VM word size). `:size` values are the number of objects in the table.
  """

  @default_individual_count 10
  @default_type_count 10

  @typedoc "A single ETS table enriched with the statistics of the type it belongs to."
  @type table_entry :: %{
          name: atom(),
          type: String.t(),
          memory: pos_integer(),
          size: non_neg_integer(),
          type_table_count: pos_integer(),
          avg_size_per_type: float()
        }

  @typedoc "Aggregated statistics for all ETS tables sharing the same type."
  @type type_entry :: %{
          type: String.t(),
          memory: pos_integer(),
          table_count: pos_integer(),
          avg_size: float()
        }

  defguardp is_pos_integer(value) when is_integer(value) and value > 0

  @doc """
  Returns the top N individual ETS tables by memory usage.

  ## Parameters
  - `count` - Number of top tables to return (default: #{@default_individual_count}).
    Must be a positive integer; anything else raises `FunctionClauseError`.

  ## Returns
  A list of maps with `:name`, `:type`, `:memory`, `:size`, `:type_table_count`, and
  `:avg_size_per_type` keys, sorted by memory (descending).

  ## Examples

      iex> ElectricTelemetry.EtsTables.top_tables(5)
      [
        %{name: :filter_shapes, type: "filter_shapes", memory: 605630464, size: 200122,
          type_table_count: 1, avg_size_per_type: 200122.0},
        %{name: :"shapedb:shape_lookup:6dd7c00b-...", type: "shapedb:shape_lookup", memory: 605625344,
          size: 200122, type_table_count: 2, avg_size_per_type: 200122.0},
        ...
      ]
  """
  @spec top_tables(pos_integer()) :: [table_entry()]
  def top_tables(count \\ @default_individual_count)

  def top_tables(count) when is_pos_integer(count) do
    tables = collect_tables()
    rank_tables(tables, calculate_type_stats(tables), count)
  end

  @doc """
  Returns the top M "types" of ETS tables by aggregated memory usage.

  See the module documentation for how a type is derived from a table name.

  ## Parameters
  - `count` - Number of top types to return (default: #{@default_type_count}).
    Must be a positive integer; anything else raises `FunctionClauseError`.

  ## Returns
  A list of maps with `:type`, `:memory`, `:table_count`, and `:avg_size` keys,
  sorted by memory (descending).

  ## Examples

      iex> ElectricTelemetry.EtsTables.top_by_type(5)
      [
        %{type: "Elixir.Electric.Registry.ShapeChange", memory: 6815744, table_count: 29, avg_size: 785.5},
        %{type: "tls_socket", memory: 60928, table_count: 24, avg_size: 1.0},
        ...
      ]
  """
  @spec top_by_type(pos_integer()) :: [type_entry()]
  def top_by_type(count \\ @default_type_count)

  def top_by_type(count) when is_pos_integer(count) do
    collect_tables()
    |> calculate_type_stats()
    |> rank_types(count)
  end

  @doc """
  Returns both top individual tables and top types in a single call.

  This is more efficient than calling both functions separately if you need both
  results, because the ETS tables are scanned only once.

  ## Parameters
  - `individual_count` - Number of top individual tables (default: #{@default_individual_count})
  - `type_count` - Number of top types (default: #{@default_type_count})

  Both must be positive integers; anything else raises `FunctionClauseError`.

  ## Returns
  A map with `:top_tables` and `:top_by_type` keys.

  ## Examples

      iex> ElectricTelemetry.EtsTables.top_memory_stats(5, 3)
      %{
        top_tables: [...],
        top_by_type: [...]
      }
  """
  @spec top_memory_stats(pos_integer(), pos_integer()) :: %{
          top_tables: [table_entry()],
          top_by_type: [type_entry()]
        }
  def top_memory_stats(
        individual_count \\ @default_individual_count,
        type_count \\ @default_type_count
      )

  def top_memory_stats(individual_count, type_count)
      when is_pos_integer(individual_count) and is_pos_integer(type_count) do
    tables = collect_tables()
    type_stats = calculate_type_stats(tables)

    %{
      top_tables: rank_tables(tables, type_stats, individual_count),
      top_by_type: rank_types(type_stats, type_count)
    }
  end

  # Private functions

  # Takes a snapshot of every ETS table that still exists and occupies memory.
  defp collect_tables do
    word_size = :erlang.system_info(:wordsize)
    Enum.flat_map(:ets.all(), &table_info(&1, word_size))
  end

  # Returns a zero- or one-element list so it can be used with Enum.flat_map/2.
  #
  # A single :ets.info/1 call is used instead of separate :ets.info/2 lookups so
  # that name, memory and size come from the same snapshot, and so that a table
  # legitimately named :undefined is not mistaken for a deleted one.
  defp table_info(table_ref, word_size) do
    case :ets.info(table_ref) do
      :undefined ->
        # Table was deleted between :ets.all() and this call
        []

      info when is_list(info) ->
        name = Keyword.fetch!(info, :name)
        memory = Keyword.fetch!(info, :memory) * word_size

        if memory > 0 do
          [%{name: name, type: table_type(name), memory: memory, size: Keyword.fetch!(info, :size)}]
        else
          []
        end
    end
  end

  # Builds %{type => %{total_memory: bytes, count: tables, avg_size: objects per table}}.
  defp calculate_type_stats(tables) do
    tables
    |> Enum.group_by(& &1.type)
    |> Map.new(fn {type, members} ->
      {count, total_memory, total_size} =
        Enum.reduce(members, {0, 0, 0}, fn table, {count, memory, size} ->
          {count + 1, memory + table.memory, size + table.size}
        end)

      # Enum.group_by/2 never produces an empty group, so count is at least 1.
      {type, %{total_memory: total_memory, count: count, avg_size: total_size / count}}
    end)
  end

  # Sorts tables by memory (descending), keeps the first `count` and only then
  # attaches the per-type statistics, so tables that are dropped are not enriched.
  defp rank_tables(tables, type_stats, count) do
    tables
    |> Enum.sort_by(& &1.memory, :desc)
    |> Enum.take(count)
    |> Enum.map(fn table ->
      %{count: type_table_count, avg_size: avg_size} = Map.fetch!(type_stats, table.type)

      Map.merge(table, %{
        type_table_count: type_table_count,
        avg_size_per_type: avg_size
      })
    end)
  end

  # Flattens the per-type statistics map into a list sorted by memory (descending).
  defp rank_types(type_stats, count) do
    type_stats
    |> Enum.map(fn {type, stats} ->
      %{
        type: type,
        memory: stats.total_memory,
        table_count: stats.count,
        avg_size: stats.avg_size
      }
    end)
    |> Enum.sort_by(& &1.memory, :desc)
    |> Enum.take(count)
  end

  defp table_type(name) when is_atom(name) do
    name
    |> Atom.to_string()
    |> extract_type_from_name()
  end

  defp table_type(name) when is_binary(name), do: extract_type_from_name(name)

  defp table_type(name), do: inspect(name)

  # Tries the ":" separator first (e.g. "Electric.StatusMonitor:6dd7c00b-8e31"),
  # then "_" (e.g. "stack_call_home_telemetry_6dd7c00b-8"). Falling through to "_"
  # even when the name contains a colon keeps names such as "a:b_6dd7c00b-..."
  # from turning into one unique type per UUID. When neither separator is
  # followed by a UUID-like segment the full name is the type.
  defp extract_type_from_name(name_string) when is_binary(name_string) do
    Enum.find_value([":", "_"], name_string, &type_prefix(name_string, &1))
  end

  # Returns everything before the first UUID-like segment, or nil when there is no
  # such segment or when it is the very first one (nothing left to group by).
  defp type_prefix(name_string, separator) do
    parts = String.split(name_string, separator)

    case Enum.find_index(parts, &uuid_like?/1) do
      index when is_integer(index) and index > 0 ->
        parts
        |> Enum.take(index)
        |> Enum.join(separator)

      _none_or_leading ->
        nil
    end
  end

  # UUID pattern: 8 hex digits, optionally followed by -4hex-4hex-4hex-12hex.
  # Every group after the first 8 digits is optional and the pattern is not
  # anchored at the end, so in practice a segment matches as soon as it starts
  # with 8 hex digits.
  #
  # The regex literal lives in the function body rather than in a module
  # attribute so that no compiled regex term has to be stored in an attribute.
  defp uuid_like?(part) do
    String.match?(part, ~r/^[0-9a-f]{8}(-[0-9a-f]{4}){0,3}(-[0-9a-f]{1,12})?/i)
  end
end

packages/electric-telemetry/lib/electric/telemetry/opts.ex

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ defmodule ElectricTelemetry.Opts do
3535
]},
3636
default: {:count, 5}
3737
],
38+
top_ets_table_count: [type: :integer, default: 10],
3839
# Garbage collection should run almost instantly since each process has its own heap that
3940
# is garbage collected independently of others. 50ms might be too generous.
4041
long_gc_threshold: [type: :integer, default: 50],

0 commit comments

Comments
 (0)