gooddata · rhefner1 · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026
diff --git a/aws/eks.tf b/aws/eks.tf
@@ -29,25 +29,7 @@ locals {
     ECRPullThroughCacheMin = aws_iam_policy.ecr_pull_through_cache_min[0].arn
   } : {}
 
-  eks_node_type_presets = {
-    dev        = ["m6a.xlarge", "m6a.2xlarge"]
-    prod-small = ["m8a.xlarge", "m8a.2xlarge"]
-    prod-large = ["m8a.xlarge", "m8a.2xlarge", "m8a.4xlarge"]
-    prod-xl    = ["m8a.xlarge", "m8a.2xlarge", "m8a.4xlarge"]
-  }
-
-  eks_starrocks_node_type_presets = {
-    dev        = ["r8a.large", "m8a.xlarge"]
-    prod-small = ["r8a.large", "r8a.xlarge"]
-    prod-xl    = ["r8a.large", "r8a.8xlarge"]
-  }
-
-  # There is no dedicated prod-large StarRocks profile; fall back to prod-xl
-  # StarRocks sizing when size_profile is prod-large (see gdcn-size-prod-large).
-  starrocks_size_profile_effective = coalesce(var.starrocks_size_profile, var.size_profile == "prod-large" ? "prod-xl" : var.size_profile)
-
-  eks_node_types           = coalesce(var.eks_node_types, local.eks_node_type_presets[var.size_profile])
-  eks_starrocks_node_types = coalesce(var.eks_starrocks_node_types, local.eks_starrocks_node_type_presets[local.starrocks_size_profile_effective])
+  # Node types / StarRocks node types / autoscaler ceiling: resolved in size-profiles.tf.
 
   # Per-AZ node groups for StarRocks so the cluster autoscaler can scale
   # nodes in the AZ where the FE/CN EBS volume lives (EBS is zonal).
@@ -130,7 +112,7 @@ module "eks" {
         }, local.ecr_pull_through_cache_policy)
 
         min_size = 0
-        max_size = var.eks_max_nodes
+        max_size = local.eks_max_nodes
 
         # This value is ignored after the initial creation
         # https://github.com/bryantbiggs/eks-desired-size-hack
@@ -174,7 +156,7 @@ module "eks" {
         }, local.ecr_pull_through_cache_policy)
 
         min_size     = 0
-        max_size     = var.eks_max_nodes
+        max_size     = local.eks_max_nodes
         desired_size = 0
       }
     },

diff --git a/aws/k8s-aws.tf b/aws/k8s-aws.tf
@@ -16,7 +16,7 @@ module "k8s_aws" {
   ingress_controller = var.ingress_controller
   dns_provider       = var.dns_provider
   route53_zone_id    = var.route53_zone_id
-  size_profile       = var.size_profile
+  ingress_replicas   = local.profile.ingress_replicas
 
   registry_k8sio = local.registry_k8sio
 

diff --git a/aws/k8s-common.tf b/aws/k8s-common.tf
@@ -49,8 +49,11 @@ module "k8s_common" {
   gdcn_license_key       = var.gdcn_license_key
   gdcn_orgs              = var.gdcn_orgs
   gdcn_helm_extra_values = var.gdcn_helm_extra_values
-  size_profile           = var.size_profile
-  starrocks_size_profile = local.starrocks_size_profile_effective
+  ingress_replicas       = local.profile.ingress_replicas
+  gdcn_size              = local.profile.gdcn_size
+  pulsar_size            = local.profile.pulsar_size
+  observability_size     = local.profile.observability_size
+  starrocks_size_profile = var.starrocks_size_profile
   cloud                  = "aws"
   ingress_controller     = var.ingress_controller
   gdcn_irsa_role_arn     = aws_iam_role.gdcn_irsa.arn

diff --git a/aws/rds.tf b/aws/rds.tf
@@ -49,11 +49,28 @@ module "rds_postgresql" {
   engine                      = "postgres"
   engine_version              = data.aws_rds_engine_version.default.version
   family                      = "postgres${split(".", data.aws_rds_engine_version.default.version)[0]}"
-  instance_class              = var.rds_instance_class
-  allocated_storage           = 20
+  instance_class              = local.rds_instance_class
+  allocated_storage           = local.rds_allocated_storage
   apply_immediately           = true
   allow_major_version_upgrade = var.rds_allow_major_version_upgrade
 
+  # Performance parameters tuned by size_profile (see size-profiles.tf). work_mem
+  # and maintenance_work_mem are dynamic (apply_method = immediate, no reboot) and
+  # are passed in kB. shared_buffers/effective_cache_size are intentionally left to
+  # the RDS instance-class defaults, which already scale with instance memory.
+  parameters = [
+    {
+      name         = "work_mem"
+      value        = tostring(local.profile.postgres.work_mem_mb * 1024)
+      apply_method = "immediate"
+    },
+    {
+      name         = "maintenance_work_mem"
+      value        = tostring(local.profile.postgres.maintenance_work_mem_mb * 1024)
+      apply_method = "immediate"
+    },
+  ]
+
   # Database name & credentials
   username                    = local.db_username
   password_wo                 = local.db_password

diff --git a/aws/settings.tfvars.example b/aws/settings.tfvars.example
@@ -7,12 +7,12 @@ aws_profile_name = "my-profile"
 # Region to deploy resources to
 aws_region = "eu-central-1"
 
-# Use an existing VPC instead of creating a new one
+# Uncomment to use an existing VPC instead of creating a new one
 # existing_vpc_id             = "vpc-0123456789abcdef0"
 # existing_private_subnet_ids = ["subnet-aaa", "subnet-bbb"]
 # existing_public_subnet_ids  = ["subnet-ccc", "subnet-ddd"]
 
-# Additional tags to apply to all AWS resources
+# Uncomment to apply additional tags to all AWS resources
 # aws_additional_tags = {
 #   Environment = "dev"
 # }
@@ -26,26 +26,29 @@ deployment_name   = "gooddata-cn"
 # GoodData.CN license key
 gdcn_license_key = "key/asdf==" # provided by GoodData
 
-# Additional gooddata-cn Helm values
+# Uncomment to append extra gooddata-cn Helm values
 # gdcn_helm_extra_values = <<-EOT
 #   service:
 #     someComponent:
 #       replicaCount: 3
 # EOT
 
-# Deploys and enables GoodData generative AI services
+# Uncomment to deploy and enable GoodData generative AI services
 # enable_ai_features = true
 
-# Enable experimental features (unofficially unsupported and unstable; talk to GoodData to learn more)
+# Uncomment to enable experimental features (not officially supported and unstable; talk to GoodData to learn more)
 # enable_experimental_features = true
 
-# Size profile controls replicas/resources across services:
+# Size profile:
 # - dev: lowest footprint, not HA
-# - prod-small: HA (>= 2 replica per service), resources for services increased
-# - prod-large: HA, sized for large/high-traffic deployments (more replicas +
-#   CPU/memory for the hot-path services than prod-small)
-# NOTE: switching size profile after deploying isn't supported since persistent
-# volume sizes can't be automatically adjusted in-place.
+# - prod-small: HA (>= 2 replicas per service), increased resources
+# - prod-large: HA, sized for large/high-traffic deployments
+# It sets sensible defaults for the metadata DB tier/storage, worker node
+# sizes/counts, observability disk sizes, and GoodData.CN/subchart sizing, so you
+# don't set those individually. To override one default, add its variable here
+# yourself (e.g. rds_instance_class, eks_node_types); see variables.tf for the list.
+# NOTE: changing size_profile after deploying isn't supported (persistent volume
+# sizes can't be resized in place).
 size_profile = "prod-small"
 
 ###
@@ -77,7 +80,7 @@ tls_mode           = "acm"
 # tls_mode           = "letsencrypt"
 # letsencrypt_email  = "me@example.com"
 
-# When ingress-nginx sits behind another L7 proxy/load balancer, set this to true.
+# Uncomment when ingress-nginx sits behind another L7 proxy/load balancer.
 # ingress_nginx_behind_l7 = true
 
 # Option 3 — Istio Gateway + Let's Encrypt certificates (letsencrypt)
@@ -121,8 +124,8 @@ gdcn_orgs = [
 # Uncomment to enable the observability stack
 # enable_observability = true
 # observability_hostname = "observability.gooddata.example.com"
-#
-# Observability data retention (optional; defaults shown). Each signal has its own 5Gi
+
+# Uncomment to override data retention (defaults shown). Each signal has its own 5Gi
 # PVC, so lower these to bound storage. Loki must be a multiple of 24h.
 # loki_retention_period       = "168h"
 # prometheus_retention_period = "168h"
@@ -143,15 +146,14 @@ gdcn_orgs = [
 # enable_ai_lake = true
 
 ###
-# EKS & infrastructure sizing (optional)
+# EKS & infrastructure (optional)
 ###
+# Uncomment to override EKS defaults
 # eks_version                      = "1.35"
-# eks_node_types                   = ["m6i.xlarge"]
-# eks_max_nodes                    = 10
 # eks_endpoint_public_access       = true
 # eks_endpoint_public_access_cidrs = ["x.x.x.x/32"]
 # eks_endpoint_private_access      = false
 
-# rds_instance_class      = "db.t4g.medium"
+# Uncomment to override RDS defaults
 # rds_deletion_protection = false
 # rds_skip_final_snapshot = true
diff --git a/aws/size-profiles.tf b/aws/size-profiles.tf
@@ -0,0 +1,97 @@
+###
+# Single source of truth for AWS sizing per size_profile: managed infra (RDS,
+# EKS nodes, autoscaler ceiling, ingress replicas, PostgreSQL tuning) inline,
+# plus workload (GoodData.CN/Pulsar/observability) sizing referenced by name.
+# StarRocks (AI Lake) is sized separately via var.starrocks_size_profile. RDS
+# class/storage, node types and the autoscaler ceiling accept var.* overrides.
+###
+
+locals {
+  size_profiles = { # one entry per profile name; indexed by the lookups at the bottom of this block
+    dev = {
+      rds = {
+        instance_class    = "db.t4g.medium"
+        allocated_storage = 20 # GB
+      }
+      eks_node_types       = ["m6a.xlarge", "m6a.2xlarge"]
+      starrocks_node_types = ["r8a.large", "m8a.xlarge"]
+      eks_max_nodes        = 6 # default node-group max_size
+      ingress_replicas     = 1
+      postgres = {
+        work_mem_mb             = 8
+        maintenance_work_mem_mb = 128
+      }
+      gdcn_size          = "dev"
+      pulsar_size        = "dev"
+      observability_size = "dev"
+    }
+    prod-small = {
+      rds = {
+        instance_class    = "db.r6g.large"
+        allocated_storage = 100
+      }
+      eks_node_types       = ["m8a.xlarge", "m8a.2xlarge"]
+      starrocks_node_types = ["r8a.large", "r8a.xlarge"]
+      eks_max_nodes        = 12
+      ingress_replicas     = 2
+      postgres = {
+        work_mem_mb             = 16
+        maintenance_work_mem_mb = 256
+      }
+      gdcn_size          = "prod-small"
+      pulsar_size        = "prod-small"
+      observability_size = "prod-small"
+    }
+    prod-large = {
+      rds = {
+        instance_class    = "db.r6g.xlarge"
+        allocated_storage = 100
+      }
+      eks_node_types = ["m8a.xlarge", "m8a.2xlarge", "m8a.4xlarge"]
+      # Unused: StarRocks has no prod-large tier (starrocks_size_profile can only
+      # be dev/prod-small/prod-xl), so this entry is never looked up. It is kept so
+      # every profile object has the same set of attributes.
+      starrocks_node_types = ["r8a.large", "r8a.8xlarge"]
+      eks_max_nodes        = 20
+      ingress_replicas     = 3
+      postgres = {
+        work_mem_mb             = 32
+        maintenance_work_mem_mb = 512
+      }
+      gdcn_size          = "prod-large"
+      pulsar_size        = "prod-large"
+      observability_size = "prod-large"
+    }
+    prod-xl = {
+      rds = {
+        instance_class    = "db.r6g.2xlarge"
+        allocated_storage = 200
+      }
+      eks_node_types       = ["m8a.xlarge", "m8a.2xlarge", "m8a.4xlarge"]
+      starrocks_node_types = ["r8a.large", "r8a.8xlarge"]
+      eks_max_nodes        = 30
+      ingress_replicas     = 3
+      postgres = {
+        work_mem_mb             = 64
+        maintenance_work_mem_mb = 1024
+      }
+      # No prod-xl GDCN/Pulsar/observability spec; explicitly reuse prod-large.
+      gdcn_size          = "prod-large"
+      pulsar_size        = "prod-large"
+      observability_size = "prod-large"
+    }
+  }
+
+  profile = local.size_profiles[var.size_profile]
+
+  # Resolved values: a non-null var.* override wins, else the profile default.
+  rds_instance_class    = coalesce(var.rds_instance_class, local.profile.rds.instance_class)
+  rds_allocated_storage = coalesce(var.rds_allocated_storage, local.profile.rds.allocated_storage)
+  eks_node_types        = coalesce(var.eks_node_types, local.profile.eks_node_types)
+  eks_max_nodes         = coalesce(var.eks_max_nodes, local.profile.eks_max_nodes)
+
+  # StarRocks node pool: indexed by the explicit var.starrocks_size_profile, NOT
+  # size_profile (the two are decoupled). Only resolved when enable_ai_lake is true
+  # (validation then requires starrocks_size_profile to be set); otherwise empty.
+  eks_starrocks_node_types = var.enable_ai_lake ? coalesce(var.eks_starrocks_node_types, local.size_profiles[var.starrocks_size_profile].starrocks_node_types) : []
+}
diff --git a/aws/variables.tf b/aws/variables.tf
@@ -89,9 +89,9 @@ variable "eks_endpoint_public_access_cidrs" {
 }
 
 variable "eks_max_nodes" {
-  description = "Maximum number of EKS worker nodes"
+  description = "Maximum number of EKS worker nodes (autoscaler ceiling). If null, chosen by size_profile."
   type        = number
-  default     = 20
+  default     = null
 }
 
 variable "eks_node_types" {
@@ -101,7 +101,7 @@ variable "eks_node_types" {
 }
 
 variable "eks_starrocks_node_types" {
-  description = "EC2 instance types for the StarRocks-dedicated EKS pool (taint workload=starrocks). If null, defaults to a preset chosen by starrocks_size_profile (or size_profile when unset)."
+  description = "EC2 instance types for the StarRocks-dedicated EKS pool (taint workload=starrocks). If null, defaults to a preset chosen by starrocks_size_profile."
   type        = list(string)
   default     = null
 }
@@ -410,10 +410,10 @@ variable "observability_hostname" {
   }
 }
 
-variable "rds_deletion_protection" {
-  description = "Enable deletion protection on the RDS instance."
-  type        = bool
-  default     = false
+variable "rds_allocated_storage" {
+  description = "RDS PostgreSQL allocated storage in GB. If null, chosen by size_profile."
+  type        = number
+  default     = null
 }
 
 variable "rds_allow_major_version_upgrade" {
@@ -422,10 +422,16 @@ variable "rds_allow_major_version_upgrade" {
   default     = false
 }
 
+variable "rds_deletion_protection" {
+  description = "Enable deletion protection on the RDS instance."
+  type        = bool
+  default     = false
+}
+
 variable "rds_instance_class" {
-  description = "RDS PostgreSQL instance class"
+  description = "RDS PostgreSQL instance class. If null, chosen by size_profile."
   type        = string
-  default     = "db.t4g.medium"
+  default     = null
 }
 
 variable "rds_skip_final_snapshot" {
@@ -455,13 +461,17 @@ variable "size_profile" {
 }
 
 variable "starrocks_size_profile" {
-  description = "Sizing profile for StarRocks (FE/CN pods and dedicated EKS node pool). If null, falls back to size_profile."
+  description = "StarRocks (AI Lake) sizing profile. Required when enable_ai_lake is true; one of: dev, prod-small, prod-xl. Not derived from size_profile."
   type        = string
   default     = null
   validation {
-    condition     = var.starrocks_size_profile == null || contains(["dev", "prod-small", "prod-xl"], coalesce(var.starrocks_size_profile, "dev"))
+    condition     = var.starrocks_size_profile == null ? true : contains(["dev", "prod-small", "prod-xl"], var.starrocks_size_profile) # conditional, not ||: contains() rejects a null value, and || does not short-circuit on every Terraform version
     error_message = "starrocks_size_profile must be one of: dev, prod-small, prod-xl."
   }
+  validation {
+    condition     = !var.enable_ai_lake || var.starrocks_size_profile != null
+    error_message = "starrocks_size_profile must be set when enable_ai_lake is true."
+  }
 }
 
 variable "starrocks_cn_image_tag" {