Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 3 additions & 21 deletions aws/eks.tf
Original file line number Diff line number Diff line change
Expand Up @@ -29,25 +29,7 @@ locals {
ECRPullThroughCacheMin = aws_iam_policy.ecr_pull_through_cache_min[0].arn
} : {}

eks_node_type_presets = {
dev = ["m6a.xlarge", "m6a.2xlarge"]
prod-small = ["m8a.xlarge", "m8a.2xlarge"]
prod-large = ["m8a.xlarge", "m8a.2xlarge", "m8a.4xlarge"]
prod-xl = ["m8a.xlarge", "m8a.2xlarge", "m8a.4xlarge"]
}

eks_starrocks_node_type_presets = {
dev = ["r8a.large", "m8a.xlarge"]
prod-small = ["r8a.large", "r8a.xlarge"]
prod-xl = ["r8a.large", "r8a.8xlarge"]
}

# There is no dedicated prod-large StarRocks profile; fall back to prod-xl
# StarRocks sizing when size_profile is prod-large (see gdcn-size-prod-large).
starrocks_size_profile_effective = coalesce(var.starrocks_size_profile, var.size_profile == "prod-large" ? "prod-xl" : var.size_profile)

eks_node_types = coalesce(var.eks_node_types, local.eks_node_type_presets[var.size_profile])
eks_starrocks_node_types = coalesce(var.eks_starrocks_node_types, local.eks_starrocks_node_type_presets[local.starrocks_size_profile_effective])
# Node types / StarRocks node types / autoscaler ceiling: resolved in size-profiles.tf.

# Per-AZ node groups for StarRocks so the cluster autoscaler can scale
# nodes in the AZ where the FE/CN EBS volume lives (EBS is zonal).
Expand Down Expand Up @@ -130,7 +112,7 @@ module "eks" {
}, local.ecr_pull_through_cache_policy)

min_size = 0
max_size = var.eks_max_nodes
max_size = local.eks_max_nodes

# This value is ignored after the initial creation
# https://github.com/bryantbiggs/eks-desired-size-hack
Expand Down Expand Up @@ -174,7 +156,7 @@ module "eks" {
}, local.ecr_pull_through_cache_policy)

min_size = 0
max_size = var.eks_max_nodes
max_size = local.eks_max_nodes
desired_size = 0
}
},
Expand Down
2 changes: 1 addition & 1 deletion aws/k8s-aws.tf
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ module "k8s_aws" {
ingress_controller = var.ingress_controller
dns_provider = var.dns_provider
route53_zone_id = var.route53_zone_id
size_profile = var.size_profile
ingress_replicas = local.profile.ingress_replicas

registry_k8sio = local.registry_k8sio

Expand Down
7 changes: 5 additions & 2 deletions aws/k8s-common.tf
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,11 @@ module "k8s_common" {
gdcn_license_key = var.gdcn_license_key
gdcn_orgs = var.gdcn_orgs
gdcn_helm_extra_values = var.gdcn_helm_extra_values
size_profile = var.size_profile
starrocks_size_profile = local.starrocks_size_profile_effective
ingress_replicas = local.profile.ingress_replicas
gdcn_size = local.profile.gdcn_size
pulsar_size = local.profile.pulsar_size
observability_size = local.profile.observability_size
starrocks_size_profile = var.starrocks_size_profile
cloud = "aws"
ingress_controller = var.ingress_controller
gdcn_irsa_role_arn = aws_iam_role.gdcn_irsa.arn
Expand Down
21 changes: 19 additions & 2 deletions aws/rds.tf
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,28 @@ module "rds_postgresql" {
engine = "postgres"
engine_version = data.aws_rds_engine_version.default.version
family = "postgres${split(".", data.aws_rds_engine_version.default.version)[0]}"
instance_class = var.rds_instance_class
allocated_storage = 20
instance_class = local.rds_instance_class
allocated_storage = local.rds_allocated_storage
apply_immediately = true
allow_major_version_upgrade = var.rds_allow_major_version_upgrade

# Performance parameters tuned by size_profile (see size-profiles.tf). Both are
# dynamic (apply_method = immediate, no reboot). shared_buffers/effective_cache_size
# are intentionally left to the RDS instance-class defaults, which already scale
# with instance memory. Values are in kB.
parameters = [
{
name = "work_mem"
value = tostring(local.profile.postgres.work_mem_mb * 1024)
apply_method = "immediate"
},
{
name = "maintenance_work_mem"
value = tostring(local.profile.postgres.maintenance_work_mem_mb * 1024)
apply_method = "immediate"
},
]

# Database name & credentials
username = local.db_username
password_wo = local.db_password
Expand Down
38 changes: 20 additions & 18 deletions aws/settings.tfvars.example
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@ aws_profile_name = "my-profile"
# Region to deploy resources to
aws_region = "eu-central-1"

# Use an existing VPC instead of creating a new one
# Uncomment to use an existing VPC instead of creating a new one
# existing_vpc_id = "vpc-0123456789abcdef0"
# existing_private_subnet_ids = ["subnet-aaa", "subnet-bbb"]
# existing_public_subnet_ids = ["subnet-ccc", "subnet-ddd"]

# Additional tags to apply to all AWS resources
# Uncomment to apply additional tags to all AWS resources
# aws_additional_tags = {
# Environment = "dev"
# }
Expand All @@ -26,26 +26,29 @@ deployment_name = "gooddata-cn"
# GoodData.CN license key
gdcn_license_key = "key/asdf==" # provided by GoodData

# Additional gooddata-cn Helm values
# Uncomment to append extra gooddata-cn Helm values
# gdcn_helm_extra_values = <<-EOT
# service:
# someComponent:
# replicaCount: 3
# EOT

# Deploys and enables GoodData generative AI services
# Uncomment to deploy and enable GoodData generative AI services
# enable_ai_features = true

# Enable experimental features (unofficially unsupported and unstable; talk to GoodData to learn more)
# Uncomment to enable experimental features (not officially supported and unstable; talk to GoodData to learn more)
# enable_experimental_features = true

# Size profile controls replicas/resources across services:
# Size profile:
# - dev: lowest footprint, not HA
# - prod-small: HA (>= 2 replica per service), resources for services increased
# - prod-large: HA, sized for large/high-traffic deployments (more replicas +
# CPU/memory for the hot-path services than prod-small)
# NOTE: switching size profile after deploying isn't supported since persistent
# volume sizes can't be automatically adjusted in-place.
# - prod-small: HA (>= 2 replicas per service), increased resources
# - prod-large: HA, sized for large/high-traffic deployments
# It sets sensible defaults for the metadata DB tier/storage, worker node
# sizes/counts, observability disk sizes, and GoodData.CN/subchart sizing, so you
# don't set those individually. To override one default, add its variable here
# yourself (e.g. rds_instance_class, eks_node_types); see variables.tf for the list.
# NOTE: changing size_profile after deploying isn't supported (persistent volume
# sizes can't be resized in place).
size_profile = "prod-small"

###
Expand Down Expand Up @@ -77,7 +80,7 @@ tls_mode = "acm"
# tls_mode = "letsencrypt"
# letsencrypt_email = "me@example.com"

# When ingress-nginx sits behind another L7 proxy/load balancer, set this to true.
# Uncomment when ingress-nginx sits behind another L7 proxy/load balancer.
# ingress_nginx_behind_l7 = true

# Option 3 — Istio Gateway + Let's Encrypt certificates (letsencrypt)
Expand Down Expand Up @@ -121,8 +124,8 @@ gdcn_orgs = [
# Uncomment to enable the observability stack
# enable_observability = true
# observability_hostname = "observability.gooddata.example.com"
#
# Observability data retention (optional; defaults shown). Each signal has its own 5Gi

# Uncomment to override data retention (defaults shown). Each signal has its own 5Gi
# PVC, so lower these to bound storage. Loki must be a multiple of 24h.
# loki_retention_period = "168h"
# prometheus_retention_period = "168h"
Expand All @@ -143,15 +146,14 @@ gdcn_orgs = [
# enable_ai_lake = true

###
# EKS & infrastructure sizing (optional)
# EKS & infrastructure (optional)
###
# Uncomment to override EKS defaults
# eks_version = "1.35"
# eks_node_types = ["m6i.xlarge"]
# eks_max_nodes = 10
# eks_endpoint_public_access = true
# eks_endpoint_public_access_cidrs = ["x.x.x.x/32"]
# eks_endpoint_private_access = false

# rds_instance_class = "db.t4g.medium"
# Uncomment to override RDS defaults
# rds_deletion_protection = false
# rds_skip_final_snapshot = true
97 changes: 97 additions & 0 deletions aws/size-profiles.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
###
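# Single source of truth for AWS sizing per size_profile: managed infra (RDS,
# EKS nodes, autoscaler ceiling, ingress replicas) inline, plus workload
# (GoodData.CN/Pulsar/observability) sizing referenced by name. StarRocks (AI
# Lake) is sized separately via var.starrocks_size_profile. The RDS instance
# class/storage, EKS and StarRocks node types, and the autoscaler ceiling can be
# overridden via the matching var.* input; the other values (ingress replicas,
# PostgreSQL tuning, workload sizes) are taken from the profile as-is in this file.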
###

locals {
  # Sizing catalogue keyed by size_profile name. Every entry carries the same
  # set of attributes so that any profile can be picked with a plain lookup.
  size_profiles = {
    dev = {
      # Workload sizing presets, referenced by name.
      gdcn_size          = "dev"
      pulsar_size        = "dev"
      observability_size = "dev"

      # Cluster capacity.
      ingress_replicas     = 1
      eks_max_nodes        = 6
      eks_node_types       = ["m6a.xlarge", "m6a.2xlarge"]
      starrocks_node_types = ["r8a.large", "m8a.xlarge"]

      # Database instance and PostgreSQL memory tuning (values in MB).
      rds = {
        allocated_storage = 20
        instance_class    = "db.t4g.medium"
      }
      postgres = {
        maintenance_work_mem_mb = 128
        work_mem_mb             = 8
      }
    }

    prod-small = {
      # Workload sizing presets, referenced by name.
      gdcn_size          = "prod-small"
      pulsar_size        = "prod-small"
      observability_size = "prod-small"

      # Cluster capacity.
      ingress_replicas     = 2
      eks_max_nodes        = 12
      eks_node_types       = ["m8a.xlarge", "m8a.2xlarge"]
      starrocks_node_types = ["r8a.large", "r8a.xlarge"]

      # Database instance and PostgreSQL memory tuning (values in MB).
      rds = {
        allocated_storage = 100
        instance_class    = "db.r6g.large"
      }
      postgres = {
        maintenance_work_mem_mb = 256
        work_mem_mb             = 16
      }
    }

    prod-large = {
      # Workload sizing presets, referenced by name.
      gdcn_size          = "prod-large"
      pulsar_size        = "prod-large"
      observability_size = "prod-large"

      # Cluster capacity.
      ingress_replicas = 3
      eks_max_nodes    = 20
      eks_node_types   = ["m8a.xlarge", "m8a.2xlarge", "m8a.4xlarge"]
      # Never selected: starrocks_size_profile only accepts dev/prod-small/prod-xl,
      # so StarRocks has no prod-large tier. The attribute exists only so that all
      # entries of the map share the same shape.
      starrocks_node_types = ["r8a.large", "r8a.8xlarge"]

      # Database instance and PostgreSQL memory tuning (values in MB).
      rds = {
        allocated_storage = 100
        instance_class    = "db.r6g.xlarge"
      }
      postgres = {
        maintenance_work_mem_mb = 512
        work_mem_mb             = 32
      }
    }

    prod-xl = {
      # There is no prod-xl GDCN/Pulsar/observability spec, so these are
      # deliberately mapped to prod-large.
      gdcn_size          = "prod-large"
      pulsar_size        = "prod-large"
      observability_size = "prod-large"

      # Cluster capacity.
      ingress_replicas     = 3
      eks_max_nodes        = 30
      eks_node_types       = ["m8a.xlarge", "m8a.2xlarge", "m8a.4xlarge"]
      starrocks_node_types = ["r8a.large", "r8a.8xlarge"]

      # Database instance and PostgreSQL memory tuning (values in MB).
      rds = {
        allocated_storage = 200
        instance_class    = "db.r6g.2xlarge"
      }
      postgres = {
        maintenance_work_mem_mb = 1024
        work_mem_mb             = 64
      }
    }
  }

  # The catalogue entry selected by var.size_profile.
  profile = local.size_profiles[var.size_profile]

  # Effective values: a var.* input takes precedence when set, otherwise the
  # default of the selected profile applies.
  eks_max_nodes         = coalesce(var.eks_max_nodes, local.profile.eks_max_nodes)
  eks_node_types        = coalesce(var.eks_node_types, local.profile.eks_node_types)
  rds_allocated_storage = coalesce(var.rds_allocated_storage, local.profile.rds.allocated_storage)
  rds_instance_class    = coalesce(var.rds_instance_class, local.profile.rds.instance_class)

  # StarRocks node pool. The catalogue is indexed by var.starrocks_size_profile,
  # not by var.size_profile (the two are decoupled). The lookup only matters when
  # enable_ai_lake is true; otherwise the pool gets no instance types.
  eks_starrocks_node_types = var.enable_ai_lake ? coalesce(
    var.eks_starrocks_node_types,
    local.size_profiles[var.starrocks_size_profile].starrocks_node_types
  ) : []
}
32 changes: 21 additions & 11 deletions aws/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,9 @@ variable "eks_endpoint_public_access_cidrs" {
}

variable "eks_max_nodes" {
description = "Maximum number of EKS worker nodes"
description = "Maximum number of EKS worker nodes (autoscaler ceiling). If null, chosen by size_profile."
type = number
default = 20
default = null
}

variable "eks_node_types" {
Expand All @@ -101,7 +101,7 @@ variable "eks_node_types" {
}

variable "eks_starrocks_node_types" {
description = "EC2 instance types for the StarRocks-dedicated EKS pool (taint workload=starrocks). If null, defaults to a preset chosen by starrocks_size_profile (or size_profile when unset)."
description = "EC2 instance types for the StarRocks-dedicated EKS pool (taint workload=starrocks). If null, defaults to a preset chosen by starrocks_size_profile."
type = list(string)
default = null
}
Expand Down Expand Up @@ -410,10 +410,10 @@ variable "observability_hostname" {
}
}

variable "rds_deletion_protection" {
description = "Enable deletion protection on the RDS instance."
type = bool
default = false
variable "rds_allocated_storage" {
description = "RDS PostgreSQL allocated storage in GB. If null, chosen by size_profile."
type = number
default = null
}

variable "rds_allow_major_version_upgrade" {
Expand All @@ -422,10 +422,16 @@ variable "rds_allow_major_version_upgrade" {
default = false
}

# Toggle for deletion protection on the RDS instance; off unless explicitly enabled.
variable "rds_deletion_protection" {
  type        = bool
  default     = false
  description = "Enable deletion protection on the RDS instance."
}

variable "rds_instance_class" {
description = "RDS PostgreSQL instance class"
description = "RDS PostgreSQL instance class. If null, chosen by size_profile."
type = string
default = "db.t4g.medium"
default = null
}

variable "rds_skip_final_snapshot" {
Expand Down Expand Up @@ -455,13 +461,17 @@ variable "size_profile" {
}

# Sizing profile for StarRocks (AI Lake). It is independent of size_profile and
# defaults to null, which is only acceptable while enable_ai_lake is false.
variable "starrocks_size_profile" {
  description = "StarRocks (AI Lake) sizing profile. Required when enable_ai_lake is true; one of: dev, prod-small, prod-xl. Not derived from size_profile."
  type        = string
  default     = null

  # Allowed values. The coalesce() guard is intentional: contains() rejects a
  # null argument, and Terraform releases that evaluate both operands of `||`
  # (no short-circuit) would otherwise fail this rule whenever the variable is
  # left at its null default. The "dev" placeholder is never the deciding value,
  # because the null case is already accepted by the left-hand operand.
  validation {
    condition     = var.starrocks_size_profile == null || contains(["dev", "prod-small", "prod-xl"], coalesce(var.starrocks_size_profile, "dev"))
    error_message = "starrocks_size_profile must be one of: dev, prod-small, prod-xl."
  }

  # Cross-variable rule: a profile must be chosen explicitly once AI Lake is
  # enabled. NOTE(review): referencing another variable inside a validation
  # block needs a Terraform version that supports cross-object references in
  # input variable validation - confirm against the module's required_version.
  validation {
    condition     = !var.enable_ai_lake || var.starrocks_size_profile != null
    error_message = "starrocks_size_profile must be set when enable_ai_lake is true."
  }
}

variable "starrocks_cn_image_tag" {
Expand Down
Loading