Skip to content

Commit fd5c0b4

Browse files
SweetOpsconst-bon
authored and committed
Automatically reboot instance if hung (#3)
1 parent cdc6162 commit fd5c0b4

File tree

4 files changed

+130
-44
lines changed

4 files changed

+130
-44
lines changed

README.md

Lines changed: 45 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@ Note: add `${var.ssh_key_pair}` private key to the `ssh agent`.
88

99
Include this repository as a module in your existing terraform code:
1010

11-
```
11+
```terraform
1212
module "admin_tier" {
13-
source = "git::https://github.com/cloudposse/tf_instance.git?ref=tags/0.1.0"
13+
source = "git::https://github.com/cloudposse/tf_instance.git?ref=master"
1414
ansible_playbook = "${var.ansible_playbook}"
1515
ansible_arguments = "${var.ansible_arguments}"
1616
ssh_key_pair = "${var.ssh_key_pair}"
@@ -22,6 +22,9 @@ module "admin_tier" {
2222
security_groups = ["${var.security_groups}"]
2323
subnets = ["${var.subnets}"]
2424
associate_public_ip_address = "${var.associate_public_ip_address}"
25+
name = "${var.name}"
26+
namespace = "${var.namespace}"
27+
stage = "${var.stage}"
2528
}
2629
```
2730

@@ -36,7 +39,7 @@ This module depends on these modules:
3639
It is necessary to run `terraform get` to download those modules.
3740

3841
Now reference the label when creating an instance (for example):
39-
```
42+
```terraform
4043
resource "aws_ami_from_instance" "example" {
4144
name = "terraform-example"
4245
source_instance_id = "${module.admin_tier.id}"
@@ -45,35 +48,47 @@ resource "aws_ami_from_instance" "example" {
4548

4649
## Variables
4750

48-
| Name | Default | Description | Required |
49-
|:----------------------------:|:--------------:|:--------------------------------------------------------:|:---------------:|
50-
| `namespace` | `global` | Namespace (e.g. `cp` or `cloudposse`) - required for `tf_label` module | Yes |
51-
| `stage` | `default` | Stage (e.g. `prod`, `dev`, `staging` - required for `tf_label` module | Yes |
52-
| `name` | `admin` | Name (e.g. `bastion` or `db`) - required for `tf_label` module | Yes |
53-
| `ec2_ami` | `ami-cd0f5cb6` | By default it is an AMI provided by Amazon with Ubuntu 16.04 | No |
54-
| `ssh_key_pair` | `` | SSH key pair to be provisioned on instance | Yes |
55-
| `github_api_token` | `` | GitHub API token | Yes |
56-
| `github_organization` | `` | GitHub organization name | Yes |
57-
| `github_team` | `` | GitHub team | Yes |
58-
| `ansible_playbook` | `` | Path to the playbook - required for `tf_ansible` (e.g. `./admin_tier.yml`)|Yes|
59-
| `ansible_arguments` | [] | List of ansible arguments (e.g. `["--user=ubuntu"]`) | No |
60-
| `instance_type` | `t2.micro` | The type of the creating instance (e.g. `t2.micro`) | No |
61-
| `vpc_id` | `` | The id of the VPC that the creating instance security group belongs to | Yes |
62-
| `security_groups` | [] | List of Security Group IDs allowed to connect to creating instance | Yes |
63-
| `subnets` | [] | List of VPC Subnet IDs creating instance launched in | Yes |
64-
| `associate_public_ip_address`| `true` | Associate a public ip address with the creating instance. Boolean value | No |
51+
| Name | Default | Description | Required |
52+
|:-----------------------------|:--------------------------------------------:|:---------------------------------------------------------------------------------|:--------:|
53+
| `namespace` | `` | Namespace (e.g. `cp` or `cloudposse`) - required for `tf_label` module | Yes |
54+
| `stage` | `` | Stage (e.g. `prod`, `dev`, `staging`) - required for `tf_label` module | Yes |
55+
| `name` | `` | Name (e.g. `bastion` or `db`) - required for `tf_label` module | Yes |
56+
| `ec2_ami` | `ami-cd0f5cb6` | By default it is an AMI provided by Amazon with Ubuntu 16.04 | No |
57+
| `ssh_key_pair` | `` | SSH key pair to be provisioned on instance | Yes |
58+
| `github_api_token` | `` | GitHub API token | Yes |
59+
| `github_organization` | `` | GitHub organization name | Yes |
60+
| `github_team` | `` | GitHub team | Yes |
61+
| `ansible_playbook` | `` | Path to the playbook - required for `tf_ansible` (e.g. `./admin_tier.yml`) | Yes |
62+
| `ansible_arguments` | [] | List of ansible arguments (e.g. `["--user=ubuntu"]`) | No |
63+
| `instance_type` | `t2.micro` | The type of the creating instance (e.g. `t2.micro`) | No |
64+
| `vpc_id` | `` | The id of the VPC that the creating instance security group belongs to | Yes |
65+
| `security_groups` | [] | List of Security Group IDs allowed to connect to creating instance | Yes |
66+
| `subnets` | [] | List of VPC Subnet IDs creating instance launched in | Yes |
67+
| `associate_public_ip_address`| `true` | Associate a public ip address with the creating instance. Boolean value | No |
68+
| `comparison_operator` | `GreaterThanOrEqualToThreshold` | Arithmetic operation to use when comparing the specified Statistic and Threshold | No |
69+
| `metric_name` | `StatusCheckFailed_Instance` | Name for the alarm's associated metric | No |
70+
| `evaluation_periods` | `5` | Number of periods over which data is compared to the specified threshold | No |
71+
| `metric_namespace` | `AWS/EC2` | Namespace for the alarm's associated metric | No |
72+
| `applying_period` | `60` | Period in seconds over which the specified statistic is applied | No |
73+
| `statistic_level` | `Maximum` | Statistic to apply to the alarm's associated metric | No |
74+
| `metric_threshold` | `1` | Value against which the specified statistic is compared | No |
75+
| `default_alarm_action` |`action/actions/AWS_EC2.InstanceId.Reboot/1.0`| String of action to execute when this alarm transitions into an ALARM state | No |
76+
6577

66-
## Outputs
6778

68-
| Name | Decription |
69-
|:-------------------:|:-----------------------:|
70-
| `id` | Disambiguated ID |
71-
| `public_hostname` | Normalized name |
72-
| `public_ip` | Normalized namespace |
73-
| `ssh_key_pair` | Name of used AWS SSH key|
74-
| `security_group_id` | ID on the new AWS Security Group associated with creating instance|
75-
| `role` | Name of AWS IAM Role associated with creating instance|
7679

7780

81+
## Outputs
82+
83+
| Name | Description |
84+
|:--------------------|:-------------------------------------------------------------------|
85+
| `id` | ID of the created instance |
86+
| `public_hostname` | Public hostname of the created instance |
87+
| `public_ip` | Public IP address of the created instance |
88+
| `ssh_key_pair` | Name of used AWS SSH key |
89+
| `security_group_id` | ID on the new AWS Security Group associated with creating instance |
90+
| `role` | Name of AWS IAM Role associated with creating instance |
91+
| `alarm` | CloudWatch Alarm ID |
92+
7893
## References
79-
* Thanks to https://github.com/cloudposse/tf_bastion for the inspiration
94+
* Thanks to https://github.com/cloudposse/tf_bastion for the inspiration

main.tf

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -75,9 +75,9 @@ data "template_file" "user_data" {
7575
template = "${file("${path.module}/user_data.sh")}"
7676

7777
vars {
78-
user_data = "${join("\n", compact(concat(var.user_data, list(module.github_authorized_keys.user_data))))}"
79-
welcome_message = "${var.welcome_message}"
80-
ssh_user = "${var.ssh_user}"
78+
user_data = "${join("\n", compact(concat(var.user_data, list(module.github_authorized_keys.user_data))))}"
79+
welcome_message = "${var.welcome_message}"
80+
ssh_user = "${var.ssh_user}"
8181
}
8282
}
8383

@@ -88,7 +88,7 @@ resource "aws_instance" "default" {
8888
user_data = "${data.template_file.user_data.rendered}"
8989

9090
vpc_security_group_ids = [
91-
"${compact(concat(list(aws_security_group.default.id), var.security_groups))}"
91+
"${compact(concat(list(aws_security_group.default.id), var.security_groups))}",
9292
]
9393

9494
iam_instance_profile = "${aws_iam_instance_profile.default.name}"
@@ -118,3 +118,37 @@ module "ansible" {
118118
envs = ["host=${aws_eip.default.public_ip}"]
119119
playbook = "${var.ansible_playbook}"
120120
}
121+
122+
# Restart dead or hung instance
123+
124+
data "aws_region" "default" {
125+
current = true
126+
}
127+
128+
data "aws_caller_identity" "default" {}
129+
130+
resource "null_resource" "check_alarm_action" {
131+
triggers = {
132+
action = "arn:aws:swf:${data.aws_region.default.name}:${data.aws_caller_identity.default.account_id}:${var.default_alarm_action}"
133+
}
134+
}
135+
136+
resource "aws_cloudwatch_metric_alarm" "default" {
137+
alarm_name = "${module.label.id}"
138+
comparison_operator = "${var.comparison_operator}"
139+
evaluation_periods = "${var.evaluation_periods}"
140+
metric_name = "${var.metric_name}"
141+
namespace = "${var.metric_namespace}"
142+
period = "${var.applying_period}"
143+
statistic = "${var.statistic_level}"
144+
threshold = "${var.metric_threshold}"
145+
depends_on = ["null_resource.check_alarm_action"]
146+
147+
dimensions {
148+
InstanceId = "${aws_instance.default.id}"
149+
}
150+
151+
alarm_actions = [
152+
"${null_resource.check_alarm_action.triggers.action}",
153+
]
154+
}

outputs.tf

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,7 @@ output "security_group_id" {
2929
output "role" {
3030
value = "${aws_iam_role.default.name}"
3131
}
32+
33+
output "alarm" {
34+
value = "${aws_cloudwatch_metric_alarm.default.id}"
35+
}

variables.tf

Lines changed: 43 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ variable "associate_public_ip_address" {
1515
}
1616

1717
variable "ansible_arguments" {
18-
type = "list"
18+
type = "list"
1919
default = []
2020
}
2121

@@ -33,17 +33,11 @@ variable "subnets" {
3333
type = "list"
3434
}
3535

36-
variable "namespace" {
37-
default = "global"
38-
}
36+
variable "namespace" {}
3937

40-
variable "stage" {
41-
default = "default"
42-
}
38+
variable "stage" {}
4339

44-
variable "name" {
45-
default = "admin"
46-
}
40+
variable "name" {}
4741

4842
variable "ec2_ami" {
4943
default = "ami-cd0f5cb6"
@@ -61,3 +55,42 @@ variable "ssh_user" {
6155
variable "welcome_message" {
6256
default = ""
6357
}
58+
59+
variable "comparison_operator" {
60+
description = "The arithmetic operation to use when comparing the specified Statistic and Threshold. Possible values are: GreaterThanOrEqualToThreshold, GreaterThanThreshold, LessThanThreshold, LessThanOrEqualToThreshold."
61+
default = "GreaterThanOrEqualToThreshold"
62+
}
63+
64+
variable "metric_name" {
65+
description = "The name for the alarm's associated metric. Possible values you can find in https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/ec2-metricscollected.html ."
66+
default = "StatusCheckFailed_Instance"
67+
}
68+
69+
variable "evaluation_periods" {
70+
description = "The number of periods over which data is compared to the specified threshold."
71+
default = "5"
72+
}
73+
74+
variable "metric_namespace" {
75+
description = "The namespace for the alarm's associated metric. Possible values you can find in https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/aws-namespaces.html ."
76+
default = "AWS/EC2"
77+
}
78+
79+
variable "applying_period" {
80+
description = "The period in seconds over which the specified statistic is applied."
81+
default = "60"
82+
}
83+
84+
variable "statistic_level" {
85+
description = "The statistic to apply to the alarm's associated metric. Possible values are: SampleCount, Average, Sum, Minimum, Maximum"
86+
default = "Maximum"
87+
}
88+
89+
variable "metric_threshold" {
90+
description = "The value against which the specified statistic is compared."
91+
default = "1"
92+
}
93+
94+
variable "default_alarm_action" {
95+
default = "action/actions/AWS_EC2.InstanceId.Reboot/1.0"
96+
}

0 commit comments

Comments
 (0)