simple-bayesian-mmm/src/mmm.py at main · thrivent-oss/simple-bayesian-mmm · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
# Implement the model of Jin and Koehler [1] (Google) for Bayesian MMM with Caryover and Shape
# [1] https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/46001.pdf
# This code is covered by the MIT license, as documented within the source repo.

import config  # from config.py
import pandas as pd
from random import random

assert config.s > 0, 'Shape parameter should be positive'
assert config.k > 0, 'Half-saturation should be at t > 0'


def adstock(t: int, x: list[float]) -> float:
	''' Transform the temporal history of spend for one channel to the adstock for a given snapshot.

	Implements Eq. (1) of [1].  The cumulative media effect is a weighted average of media spend in
	the current snapshot and previous (L-1) snapshots.  `adstock` is called by `predict_response`.
	---
	Accepts: t - The temporal index (indexed within x) of the snapshot desired.
	Accepts: x - The spend-like data (cf. ROAS) for this channel.
	Returns: Adstock for snapshot `t` calculated from the previous L snapshots, for this channel.
	'''

	# Respect lower bound of Eq.(1) in [1].
	if t - config.L + 1 < 0:
		return 0

	def w_d(lag):
		''' Delayed adstock decay function (radial kernel basis function).

		Implements Eq. (3) of [1].
		---
		Accepts: lag - Snapshot of lag to generate decay weight for.  `𝑙` in Eq. (3) of [1].
		Returns: Decay weight for the snapshot in question.
		'''
		alpha_m = 0.5  # Variance of w_d is -1/(2 log(alpha_m)).  TODO: tune.
		theta_m = config.L // 2  # Mean of the decay peak.  TODO: tune.
		return alpha_m ** ((lag - theta_m)**2)
	# def w_d

	# weighted sum, over sum of the weights
	num, denom = 0, 0
	for lag in range(config.L):
		wd = w_d(lag)
		num += wd * x[t - lag]
		denom += wd
	# for lag

	return num / denom
	# def w_d
# def adstock


def beta_hill(x: float) -> float:
	''' Transform the spend for one channel with a "shape" function.

	In the context of temporal advertising behavior, the "shape" effect of advertising
	refers to how the timing and distribution of advertising spend over a given period
	influences consumer response, sales, or brand metrics (not just the total amount spent).

	Encodes Eq. (5) in [1].  Called by `predict_response`.
	---
	Accepts: x - The media spend for this channel, for this snapshot.
	Returns: beta_hill - The point transformation of `x` under the shape function.
	'''

	beta = config.beta
	k = config.k
	s = config.s

	transformed = (
		beta - ((beta * (k ** s)) / ((x ** s) + (k ** s)))
	)

	return transformed
# def beta_hill


def predict_response(tau: float, x: list[float], t: int, controls: pd.Series) -> float:
	''' Combine adstock and shape effects to predict response for a given temporal snapshot.

	Implements Eq.(7) in [1].
	---
	Accepts: tau - The baseline response for this time snapshot.
	Accepts: x -The spend-like data (cf. ROAS) for this channel.
	Accepts: t - The temporal index at which to predict the response, transformed into a relative
		index within the range of x (the full spend time series may be sliced to a smaller window).
	Accepts: controls - The series of external controls for this channel at this time; encodes
		z(t,c) from Eq. (7) in [1] as z_t(c), a series of controls for one time snapshot.
	Returns: y_t - The predicted responses for the time snapshot as covered by tau, x, control_df.
	'''

	control_cols = config.control_cols

	assert all(col in controls.index.to_list() for col in control_cols),\
		'Control variable data is in an unexpected format'

	# The size of x can be dynamic (cf. ROAS), so calculate adstock on each call.
	x_star = [adstock(t=_t, x=x) for _t in range(len(x))]

	def make_gammas():
		''' Identify linear coupling coefficients between external controls and response

		TODO: User should implement a custom regression model specific to their problem in order
		to best determine how the external controls couple to the response.
		'''
		return [1] * len(control_cols)  # placeholder coefficients (not realistic)
	# def make_gammas
	gamma_c = make_gammas()

	local_sum = 0
	for ind_c, col in enumerate(control_cols):
		local_sum += controls[col] * gamma_c[ind_c]
	# for c
	externals = local_sum

	# noise with constant variance
	epsilons = 0.1 * tau * random()  # TODO: tune sizes

	y_t = tau + beta_hill(x=x_star[t]) + externals + epsilons

	return y_t
# def predict_response


def create_baseline(
	df: pd.DataFrame,
	efficiency: float = 0.5,
	coupling: float = 0.9
) -> pd.DataFrame:
	''' Calculate a baseline of unlifted target.

	This is a naive approach to baseline calculation, as an explcitit method is not covered in [1].
	---
	Accepts: df - lifted signal, with total spend, pre-aggregated either to national or DMA level
	Accepts: efficiency - Assumed efficiency of all spending in aggregate.  TODO: tune.
	Accepts: coupling - Assumed coupling strength between spend and lift.  TODO: tune.
	Returns: df, updated with added baseline column
	'''

	target_feature = config.target_feature

	assert all(col in df.columns for col in ['spend', target_feature]), 'Unexpected data format'

	max_spend = max(df['spend'])

	df['index'] = df.index
	df['stock'] = df.apply(
		lambda row:
			adstock(t=int(row['index']), x=df['spend'].to_list())
		, axis=1
	)

	def hill_lift(row):
		''' Model lift from baseline as sigmoid-like hill function '''
		max_lift = (max(df[target_feature]) - max_spend * efficiency) * coupling
		half_saturation = max_lift / 10
		curvature = 1

		local_stock = df.loc[int(row['index']), 'stock']

		m = row['spend'] + local_stock
		return max_lift * (m**curvature / (m**curvature + half_saturation**curvature))
	# def hill_lift

	df['baseline'] = df.apply(
		lambda row:
			row[target_feature] - hill_lift(row)
		, axis=1
	)

	return df.drop(columns=['index', 'stock'])
# def create_baseline


def roas(t0: int, t1: int, df: pd.DataFrame, controls_df: pd.DataFrame) -> float:
	''' Calculate the return on ad spend (ROAS) (AKA ROI) for one channel between two time values.

	Implements Eq. (10) from [1].
	TODO: calls to predict_response can be done in two shots instead of a loop, with rewriting.
	---
	Accepts: t0 - The starting temporal index of x to determine ROAS from.
	Accepts: t1 - The ending temporal index of x to determine ROAS until.
	Accepts: df - baseline, lifted signal, ttl spend, aggregated either to national or DMA level
	Accepts: controls_df - The time series of external controls; encodes z(t,c) from Eq. (7) in [1].
	Returns: roas - The calculated ROAS value (change in response per dollar spent on the channel).
	'''

	control_cols = config.control_cols

	assert t0 > config.L - 1,\
		f'ROAS must look up to L units back in history, but L={config.L} and t0={t0}.'
	assert t1 + config.L < len(df) + 1,\
		'ROAS must consider adstock effects up to L units into the future, but '\
		f't1+L={t1+config.L} and len(df)={len(df)}.'
	assert all(col in df.columns for col in ['baseline', 'spend']),\
		'Baseline data passed to ROAS is in an unexpected format'
	assert all(col in controls_df.columns for col in control_cols),\
		'Control variable data passed to ROAS is in an unexpected format'

	x = df['spend'].to_list()
	x_tilde = (  # zero during calculation window, x otherwise
		x[:t0]
		+ [0] * (t1 + 1 - t0)
		+ x[t1 + 1:]
	)

	# Collect references for efficient calls.
	taus = df['baseline'].to_list()

	numerator = 0
	for t in range(t0, t1 + config.L):
		# Predicted response in the [t0, t1] window, with actual spend.  LHS of top of Eq. (10).
		pred_window = predict_response(
			tau=taus[t],
			x=x,
			t=t,  # snapshot at t=t is the final element of the x which was passed.
			controls=controls_df.iloc[t]
		)

		# Predicted response after t1, with zero spend.  RHS of top of Eq. (10).
		pred_future = predict_response(
			tau=taus[t],
			x=x_tilde,
			t=t,  # snapshot at t=t is the final element of the x which was passed.
			controls=controls_df.iloc[t]
		)

		numerator += pred_window - pred_future
	# for t

	denomerator = sum(x[t0:t1 + 1])
	roas = numerator / denomerator

	return roas
# def roas