Nomad version
Nomad v0.8.3 (c85483da3471f4bd3a7c3de112e95f551071769f)
Operating system and Environment details
3.10.0-327.36.3.el7.x86_64
Issue
A batch job executed and completed successfully; then, several hours later, when its allocation was garbage collected, the job was re-run.
Reproduction steps
Not sure. Seems to be happening frequently on our cluster though.
Nomad logs
2018/05/14 23:30:50.541581 [DEBUG] worker: dequeued evaluation 5e2dfa95-ce49-9e4d-621d-d0e900d6c3aa
2018/05/14 23:30:50.541765 [DEBUG] sched: <Eval "5e2dfa95-ce49-9e4d-621d-d0e900d6c3aa" JobID: "REDACTED-9d565598-63f2-4c2d-b506-5ae64d4397a2" Namespace: "default">: Total changes: (place 1) (destructive 0) (inplace 0) (stop 0)
2018/05/14 23:30:50.546101 [DEBUG] worker: submitted plan at index 355103 for evaluation 5e2dfa95-ce49-9e4d-621d-d0e900d6c3aa
2018/05/14 23:30:50.546140 [DEBUG] sched: <Eval "5e2dfa95-ce49-9e4d-621d-d0e900d6c3aa" JobID: "REDACTED-9d565598-63f2-4c2d-b506-5ae64d4397a2" Namespace: "default">: setting status to complete
2018/05/14 23:30:50.547618 [DEBUG] worker: updated evaluation <Eval "5e2dfa95-ce49-9e4d-621d-d0e900d6c3aa" JobID: "REDACTED-9d565598-63f2-4c2d-b506-5ae64d4397a2" Namespace: "default">
2018/05/14 23:30:50.547683 [DEBUG] worker: ack for evaluation 5e2dfa95-ce49-9e4d-621d-d0e900d6c3aa
2018/05/14 23:30:52.074437 [DEBUG] client: starting task runners for alloc '5d0016ac-dd71-6626-f929-6398e80ef28e'
2018/05/14 23:30:52.074769 [DEBUG] client: starting task context for 'REDACTED-task' (alloc '5d0016ac-dd71-6626-f929-6398e80ef28e')
2018-05-14T23:30:52.085-0400 [DEBUG] plugin: starting plugin: path=REDACTED/bin/nomad args="[REDACTED/nomad executor {"LogFile":"REDACTED/alloc/5d0016ac-dd71-6626-f929-6398e80ef28e/REDACTED-task/executor.out","LogLevel":"DEBUG"}]"
2018/05/14 23:34:32.288406 [INFO] client: task "REDACTED-task" for alloc "5d0016ac-dd71-6626-f929-6398e80ef28e" completed successfully
2018/05/14 23:34:32.288438 [INFO] client: Not restarting task: REDACTED-task for alloc: 5d0016ac-dd71-6626-f929-6398e80ef28e
2018/05/14 23:34:32.289213 [INFO] client.gc: marking allocation 5d0016ac-dd71-6626-f929-6398e80ef28e for GC
2018/05/15 01:39:13.888635 [INFO] client.gc: garbage collecting allocation 5d0016ac-dd71-6626-f929-6398e80ef28e due to new allocations and over max (500)
2018/05/15 01:39:15.389175 [WARN] client: failed to broadcast update to allocation "5d0016ac-dd71-6626-f929-6398e80ef28e"
2018/05/15 01:39:15.389401 [INFO] client.gc: marking allocation 5d0016ac-dd71-6626-f929-6398e80ef28e for GC
2018/05/15 01:39:15.390656 [DEBUG] client: terminating runner for alloc '5d0016ac-dd71-6626-f929-6398e80ef28e'
2018/05/15 01:39:15.390714 [DEBUG] client.gc: garbage collected "5d0016ac-dd71-6626-f929-6398e80ef28e"
2018/05/15 04:25:10.541590 [INFO] client.gc: garbage collecting allocation 5d0016ac-dd71-6626-f929-6398e80ef28e due to new allocations and over max (500)
2018/05/15 04:25:10.541626 [DEBUG] client.gc: garbage collected "5d0016ac-dd71-6626-f929-6398e80ef28e"
2018/05/15 05:46:37.119467 [DEBUG] worker: dequeued evaluation e15e469e-e4f5-2192-207b-84f6a17fd25f
2018/05/15 05:46:37.139904 [DEBUG] sched: <Eval "e15e469e-e4f5-2192-207b-84f6a17fd25f" JobID: "REDACTED-9d565598-63f2-4c2d-b506-5ae64d4397a2" Namespace: "default">: Total changes: (place 1) (destructive 0) (inplace 0) (stop 0)
2018/05/15 05:46:37.169051 [INFO] client.gc: marking allocation 5d0016ac-dd71-6626-f929-6398e80ef28e for GC
2018/05/15 05:46:37.169149 [INFO] client.gc: garbage collecting allocation 5d0016ac-dd71-6626-f929-6398e80ef28e due to forced collection
2018/05/15 05:46:37.169194 [DEBUG] client.gc: garbage collected "5d0016ac-dd71-6626-f929-6398e80ef28e"
2018/05/15 05:46:37.177470 [DEBUG] worker: submitted plan at index 373181 for evaluation e15e469e-e4f5-2192-207b-84f6a17fd25f
2018/05/15 05:46:37.177516 [DEBUG] sched: <Eval "e15e469e-e4f5-2192-207b-84f6a17fd25f" JobID: "REDACTED-9d565598-63f2-4c2d-b506-5ae64d4397a2" Namespace: "default">: setting status to complete
2018/05/15 05:46:37.179391 [DEBUG] worker: updated evaluation <Eval "e15e469e-e4f5-2192-207b-84f6a17fd25f" JobID: "REDACTED-9d565598-63f2-4c2d-b506-5ae64d4397a2" Namespace: "default">
2018/05/15 05:46:37.179783 [DEBUG] worker: ack for evaluation e15e469e-e4f5-2192-207b-84f6a17fd25f
2018/05/15 05:46:40.218701 [DEBUG] client: starting task runners for alloc '928b0562-b7ed-a3c7-d989-89519edadee9'
2018/05/15 05:46:40.218982 [DEBUG] client: starting task context for 'REDACTED-task' (alloc '928b0562-b7ed-a3c7-d989-89519edadee9')
2018-05-15T05:46:40.230-0400 [DEBUG] plugin: starting plugin: path=REDACTED/bin/nomad args="[REDACTED/nomad executor {"LogFile":"REDACTED/alloc/928b0562-b7ed-a3c7-d989-89519edadee9/REDACTED-task/executor.out","LogLevel":"DEBUG"}]"
2018/05/15 11:50:17.836313 [INFO] client: task "REDACTED-task" for alloc "928b0562-b7ed-a3c7-d989-89519edadee9" completed successfully
2018/05/15 11:50:17.836336 [INFO] client: Not restarting task: REDACTED-task for alloc: 928b0562-b7ed-a3c7-d989-89519edadee9
2018/05/15 11:50:17.836698 [INFO] client.gc: marking allocation 928b0562-b7ed-a3c7-d989-89519edadee9 for GC
Job file (if appropriate)
{
"Job": {
"AllAtOnce": false,
"Constraints": [
{
"LTarget": "${node.unique.id}",
"Operand": "=",
"RTarget": "52c7e5be-a5a0-3a34-1051-5209a91a0197"
}
],
"CreateIndex": 393646,
"Datacenters": [
"dc1"
],
"ID": "REDACTED-9d565598-63f2-4c2d-b506-5ae64d4397a2",
"JobModifyIndex": 393646,
"Meta": null,
"Migrate": null,
"ModifyIndex": 393673,
"Name": "REDACTED",
"Namespace": "default",
"ParameterizedJob": null,
"ParentID": "REDACTED/dispatch-1526033570-3cdd72d9",
"Payload": null,
"Periodic": null,
"Priority": 50,
"Region": "global",
"Reschedule": null,
"Stable": false,
"Status": "dead",
"StatusDescription": "",
"Stop": false,
"SubmitTime": 1526403442162340993,
"TaskGroups": [
{
"Constraints": [
{
"LTarget": "${attr.os.signals}",
"Operand": "set_contains",
"RTarget": "SIGTERM"
}
],
"Count": 1,
"EphemeralDisk": {
"Migrate": false,
"SizeMB": 300,
"Sticky": false
},
"Meta": null,
"Migrate": null,
"Name": "REDACTED",
"ReschedulePolicy": {
"Attempts": 1,
"Delay": 5000000000,
"DelayFunction": "constant",
"Interval": 86400000000000,
"MaxDelay": 0,
"Unlimited": false
},
"RestartPolicy": {
"Attempts": 1,
"Delay": 15000000000,
"Interval": 86400000000000,
"Mode": "fail"
},
"Tasks": [
{
"Artifacts": null,
"Config": {
"command": "REDACTED",
"args": [REDACTED]
},
"Constraints": null,
"DispatchPayload": null,
"Driver": "raw_exec",
"Env": {REDACTED},
"KillSignal": "SIGTERM",
"KillTimeout": 5000000000,
"Leader": false,
"LogConfig": {
"MaxFileSizeMB": 10,
"MaxFiles": 10
},
"Meta": null,
"Name": "REDACTED",
"Resources": {
"CPU": 100,
"DiskMB": 0,
"IOPS": 0,
"MemoryMB": 256,
"Networks": null
},
"Services": null,
"ShutdownDelay": 0,
"Templates": null,
"User": "",
"Vault": null
}
],
"Update": null
}
],
"Type": "batch",
"Update": {
"AutoRevert": false,
"Canary": 0,
"HealthCheck": "",
"HealthyDeadline": 0,
"MaxParallel": 0,
"MinHealthyTime": 0,
"Stagger": 0
},
"VaultToken": "",
"Version": 0
}
}
What I can tell you for sure is that the allocation ran to completion and exited successfully.
We're going to try turning off the reschedule and restart policies to see if that has any effect, since we already handle re-running these jobs ourselves on any kind of failure.