AVD Alerts in Terraform

Following up on my earlier post (https://seehad.tech/2021/08/26/add-robust-monitoring-of-azure-virtual-desktop-using-azure-monitor-alerts/), I've put these alerts into a Terraform module. You can find the module here: https://github.com/chad-neal/avdtf-with-modules


# Resource-group module. Its outputs (rg_name, rg_id, rg_location) are
# referenced by the alert resources in this file.
module "rg" {
  source = "../RG"
}
# Action group targeted by every alert in this file: one e-mail receiver
# using the common alert schema.
resource "azurerm_monitor_action_group" "email" {
  resource_group_name = module.rg.rg_name
  name                = "Email Desk"
  short_name          = "Email"

  email_receiver {
    name                    = "Email"
    email_address           = "Azure_Alerts@emaildomain.com"
    use_common_alert_schema = true
  }
}
# Activity-log alert on Azure Service Health events for the
# "Windows Virtual Desktop" service in the listed regions.
# NOTE(review): the scope is the resource group; Service Health alerts are
# usually scoped to a subscription - confirm this scope is accepted.
resource "azurerm_monitor_activity_log_alert" "avd-service-health" {
  name                = "${var.client_name} - AVD Service Health"
  description         = "This alert will monitor AVD Service Health."
  resource_group_name = module.rg.rg_name
  scopes              = [module.rg.rg_id]

  criteria {
    category = "ServiceHealth"

    service_health {
      # Service Health event types that raise the alert.
      events = ["Incident", "ActionRequired", "Security"]

      # Regions whose health events are considered.
      locations = [
        "East US",
        "East US 2",
        "Global",
        "South Central US",
        "West US",
        "West US 2",
      ]

      services = ["Windows Virtual Desktop"]
    }
  }

  # Notify the shared e-mail action group.
  action {
    action_group_id = azurerm_monitor_action_group.email.id
  }
}
# Log alert: fires when more than 20 WVDErrors rows report that no healthy
# session host was available in the pool within the evaluation window.
resource "azurerm_monitor_scheduled_query_rules_alert" "avd-no-resources" {
  name                = "${var.client_name} - AVD 'No available resources'"
  location            = module.rg.rg_location
  resource_group_name = module.rg.rg_name
  data_source_id      = var.workspace_id
  description         = "This alert will monitor AVD for error 'No Available Resources'."

  action {
    # action_group is a collection of action group IDs, not a single string.
    action_group = [azurerm_monitor_action_group.email.id]
  }

  enabled  = true
  severity = 1

  # time_window must be >= frequency. The original values (frequency 15,
  # time_window 5) were inverted, leaving 10 minutes of every cycle unqueried.
  frequency   = 5
  time_window = 15

  # Heredocs do not process backslash escapes, so the KQL string literals are
  # written with plain double quotes (a `\"` would reach Azure verbatim).
  query = <<-QUERY
    WVDErrors
    | where CodeSymbolic == "ConnectionFailedNoHealthyRdshAvailable" and Message contains "Could not find any SessionHost available in specified pool"
  QUERY

  trigger {
    operator  = "GreaterThan"
    threshold = 20
  }
}
# Log alert: fires when at least one Perf sample reports the
# "Available Mbytes" memory counter at or below 1024 within the window.
resource "azurerm_monitor_scheduled_query_rules_alert" "avd-host-mem-below-gb" {
  name                = "${var.client_name} - AVD Available Host Memory below 1GB"
  location            = module.rg.rg_location
  resource_group_name = module.rg.rg_name
  data_source_id      = var.workspace_id
  description         = "This alert will be triggered when Available Host Memory is less than 1GB."

  action {
    # action_group is a collection of action group IDs, not a single string.
    action_group = [azurerm_monitor_action_group.email.id]
  }

  enabled  = true
  severity = 2

  # time_window must be >= frequency. The original values (frequency 15,
  # time_window 5) were inverted, leaving 10 minutes of every cycle unqueried.
  frequency   = 5
  time_window = 15

  # Heredocs do not process backslash escapes, so the KQL string literals are
  # written with plain double quotes (a `\"` would reach Azure verbatim).
  query = <<-QUERY
    Perf
    | where ObjectName == "Memory"
    | where CounterName == "Available Mbytes"
    | where CounterValue <= 1024
  QUERY

  trigger {
    operator  = "GreaterThanOrEqual"
    threshold = 1
  }
}
# Log alert: fires when the query below returns 10 or more rows in a
# 15-minute window, evaluated every 5 minutes. The query selects started
# WVDConnections whose SessionHostOSVersion is "<>" (treated as failures)
# and left-joins them to WVDErrors on CorrelationId.
resource "azurerm_monitor_scheduled_query_rules_alert" "avd-failed-connections" {
  name                = "${var.client_name} - AVD Failed Connections"
  location            = module.rg.rg_location
  resource_group_name = module.rg.rg_name
  data_source_id      = var.workspace_id
  description         = "This alert will be triggered when there's more than 10 failed AVD connections in 15 minutes."

  action {
    # action_group is a collection of action group IDs, not a single string.
    action_group = [azurerm_monitor_action_group.email.id]
  }

  enabled     = true
  severity    = 2
  frequency   = 5
  time_window = 15

  # Heredocs do not process backslash escapes, so the KQL string literals are
  # written with plain double quotes (a `\"` would reach Azure verbatim).
  query = <<-QUERY
    WVDConnections
    | where State =~ "Started" and Type =~"WVDConnections"
    | extend Multi=split(_ResourceId, "/") | extend CState=iff(SessionHostOSVersion=="<>","Failure","Success")
    | where CState =~"Failure"
    | order by TimeGenerated desc
    | where State =~ "Started" | extend Multi=split(_ResourceId, "/")
    | project ResourceAlias, ResourceGroup=Multi[4], HostPool=Multi[8], SessionHostName, UserName, CState=iff(SessionHostOSVersion=="<>","Failure","Success"), CorrelationId, TimeGenerated
    | join kind= leftouter (WVDErrors) on CorrelationId
    | extend DurationFromLogon=datetime_diff("Second",TimeGenerated1,TimeGenerated)
    | project  TimeStamp=TimeGenerated, DurationFromLogon, UserName, ResourceAlias, SessionHost=SessionHostName, Source, CodeSymbolic, ErrorMessage=Message, ErrorCode=Code, ErrorSource=Source ,ServiceError, CorrelationId
    | order by TimeStamp desc
  QUERY

  trigger {
    operator  = "GreaterThanOrEqual"
    threshold = 10
  }
}
# Log alert: fires when at least one Event row with EventID 26, a non-empty
# Message and a UserName other than NT AUTHORITY\SYSTEM appears in a
# 5-minute window, evaluated every 5 minutes.
resource "azurerm_monitor_scheduled_query_rules_alert" "avd-fslogix-errors" {
  name                = "${var.client_name} - AVD FSLogix Errors"
  location            = module.rg.rg_location
  resource_group_name = module.rg.rg_name
  data_source_id      = var.workspace_id
  description         = "This alert will be triggered when there's more than 1 FSLogix Errors in 5 minutes."

  action {
    # action_group is a collection of action group IDs, not a single string.
    action_group = [azurerm_monitor_action_group.email.id]
  }

  enabled     = true
  severity    = 2
  frequency   = 5
  time_window = 5

  # The doubled backslash is intentional: the heredoc passes it through
  # unchanged and KQL then reads it as one escaped backslash.
  query = <<-QUERY
    Event
    | where EventID == "26" and isnotnull(Message)
    | where Message != ""
    | where UserName != "NT AUTHORITY\\SYSTEM"
    | order by TimeGenerated desc
  QUERY

  trigger {
    operator  = "GreaterThanOrEqual"
    threshold = 1
  }
}
# Log alert: fires when 20 or more WVDErrors rows with CodeSymbolic
# "OutOfMemory" (session host memory exhausted) appear in a 30-minute
# window, evaluated every 5 minutes.
resource "azurerm_monitor_scheduled_query_rules_alert" "avd-out-of-memory" {
  name                = "${var.client_name} - AVD Host Out of Memory Errors"
  location            = module.rg.rg_location
  resource_group_name = module.rg.rg_name
  data_source_id      = var.workspace_id
  description         = "This alert will be triggered when there's more than 20 Out of Memory Errors in 30 minutes."

  action {
    # action_group is a collection of action group IDs, not a single string.
    action_group = [azurerm_monitor_action_group.email.id]
  }

  enabled     = true
  severity    = 1
  frequency   = 5
  time_window = 30

  # Heredocs do not process backslash escapes, so the KQL string literals are
  # written with plain double quotes (a `\"` would reach Azure verbatim).
  query = <<-QUERY
    WVDErrors
    | where CodeSymbolic == "OutOfMemory" and Message contains "The user was disconnected because the session host memory was exhausted."
  QUERY

  trigger {
    operator  = "GreaterThanOrEqual"
    threshold = 20
  }
}
# Log alert: fires when 50 or more Perf samples report "% Processor Time"
# for the _Total instance at or above 99 in a 10-minute window, evaluated
# every 5 minutes.
resource "azurerm_monitor_scheduled_query_rules_alert" "avd-high-cpu" {
  name                = "${var.client_name} - AVD Host % Proc Time Greater Than 99"
  location            = module.rg.rg_location
  resource_group_name = module.rg.rg_name
  data_source_id      = var.workspace_id
  description         = "This alert will be triggered when there's more than 50 High CPU alerts in 10 minutes."

  action {
    # action_group is a collection of action group IDs, not a single string.
    action_group = [azurerm_monitor_action_group.email.id]
  }

  enabled     = true
  severity    = 1
  frequency   = 5
  time_window = 10

  query = <<-QUERY
    Perf
    | where CounterName == "% Processor Time"
    | where InstanceName == "_Total"
    | where CounterValue >= 99
  QUERY

  trigger {
    operator  = "GreaterThanOrEqual"
    threshold = 50
  }
}
# Metric alert on the Log Analytics workspace metric
# "Average_% Committed Bytes In Use" (>= 80, 5-minute window, evaluated every
# 5 minutes). Created disabled (enabled = false).
resource "azurerm_monitor_metric_alert" "avd-pct-proc-pagefile" {
  name                = "${var.client_name} - AVD Pct Processor committed bytes utilization"
  resource_group_name = module.rg.rg_name

  # scopes takes a set of resource IDs; var.workspace_id is a single ID
  # (it is used as data_source_id elsewhere), so it must be wrapped in a list.
  scopes = [var.workspace_id]

  description = "Action will be triggered when Average % of Committed Bytes in Use is greater than 80."
  enabled     = false
  frequency   = "PT5M"
  window_size = "PT5M"
  severity    = 2

  criteria {
    metric_namespace = "Microsoft.OperationalInsights/workspaces"
    metric_name      = "Average_% Committed Bytes In Use"
    aggregation      = "Maximum"
    operator         = "GreaterThanOrEqual"
    threshold        = 80

    dimension {
      name     = "ApiName"
      operator = "Include"
      values   = ["*"]
    }
  }

  # Notify the shared e-mail action group.
  action {
    action_group_id = azurerm_monitor_action_group.email.id
  }
}
# Metric alert on the FileCapacity metric of the "fshare" file share:
# fires when the 1-hour average is at or above var.storageacct_threshold_bytes,
# evaluated every 5 minutes.
resource "azurerm_monitor_metric_alert" "avd-sa-capacity" {
  name                = "${var.client_name} - AVD Storage Account Capacity Alert"
  resource_group_name = module.rg.rg_name

  # scopes takes a set of resource IDs. The variable's declaration is not
  # visible here, so flatten([...]) accepts either a single ID string or a
  # list of IDs and always yields a flat list.
  scopes = flatten([var.storageacct_id])

  description              = "Action will be triggered when Storage Account Capacity is close to full."
  enabled                  = true
  frequency                = "PT5M"
  window_size              = "PT1H"
  severity                 = 1
  target_resource_type     = "Microsoft.Storage/storageAccounts/fileServices"
  target_resource_location = var.storageacct_region

  criteria {
    metric_namespace = "microsoft.storage/storageaccounts/fileservices"
    metric_name      = "FileCapacity"
    aggregation      = "Average"
    operator         = "GreaterThanOrEqual"
    threshold        = var.storageacct_threshold_bytes

    # Only the file share named "fshare" is evaluated.
    dimension {
      name     = "FileShare"
      operator = "Include"
      values   = ["fshare"]
    }
  }

  # Notify the shared e-mail action group.
  action {
    action_group_id = azurerm_monitor_action_group.email.id
  }
}

Leave a comment