首页 > 解决方案 > 无法理解 ECS 的内存预留(MemoryReservation)

问题描述

我的 ECS EC2 实例因内存问题而被终止(kill)。

我看到了下面的图表,但不明白到底发生了什么。

MemoryUtilization 显示我使用的内存不到 40%。为什么我的实例会被杀死?

为什么 MemoryReservation 只有 25%?
我的实例有 2 GB 内存(t2.small),我将软限制设置为 512 MB,我猜这就是 25% 的由来。那么,如果内存使用超过 25%,我的任务会被杀死吗?

在此处输入图像描述

编辑

我将 memoryReservation 从 512 MB 增加到了 1024 MB,任务定义如下:

{
  "ipcMode": null,
  "executionRoleArn": "arn:aws:iam::244842952809:role/ecsTaskExecutionRole",
  "containerDefinitions": [
    {
      "dnsSearchDomains": null,
      "environmentFiles": null,
      "logConfiguration": {
        "logDriver": "awslogs",
        "secretOptions": null,
        "options": {
          "awslogs-group": "/ecs/littlehome-web",
          "awslogs-region": "ap-northeast-2",
          "awslogs-stream-prefix": "ecs"
        }
      },
      "entryPoint": null,
      "portMappings": [
        {
          "hostPort": 80,
          "protocol": "tcp",
          "containerPort": 80
        },
        {
          "hostPort": 443,
          "protocol": "tcp",
          "containerPort": 443
        }
      ],
      "command": null,
      "linuxParameters": {
        "capabilities": null,
        "sharedMemorySize": null,
        "tmpfs": null,
        "devices": null,
        "maxSwap": 8192,
        "swappiness": 60,
        "initProcessEnabled": null
      },
      "cpu": 0,
      "environment": [],
      "resourceRequirements": null,
      "ulimits": null,
      "repositoryCredentials": {
        "credentialsParameter": "arn:aws:secretsmanager:ap-northeast-2:244842952809:secret:littlehome/dockerhub_secret-vsXHjb"
      },
      "dnsServers": null,
      "mountPoints": [],
      "workingDirectory": null,
      "secrets": null,
      "dockerSecurityOptions": null,
      "memory": null,
      "memoryReservation": 1024,
      "volumesFrom": [],
      "stopTimeout": null,
      "image": "docker.io/littlehome/littlehome",
      "startTimeout": null,
      "firelensConfiguration": null,
      "dependsOn": null,
      "disableNetworking": null,
      "interactive": null,
      "healthCheck": null,
      "essential": true,
      "links": null,
      "hostname": null,
      "extraHosts": null,
      "pseudoTerminal": null,
      "user": null,
      "readonlyRootFilesystem": null,
      "dockerLabels": null,
      "systemControls": null,
      "privileged": null,
      "name": "littlehome-web"
    }
  ],
  "placementConstraints": [],
  "memory": null,
  "taskRoleArn": "arn:aws:iam::244842952809:role/ecsTaskExecutionRole",
  "compatibilities": [
    "EC2"
  ],
  "taskDefinitionArn": "arn:aws:ecs:ap-northeast-2:244842952809:task-definition/littlehome-web:11",
  "family": "littlehome-web",
  "requiresAttributes": [
    {
      "targetId": null,
      "targetType": null,
      "value": null,
      "name": "com.amazonaws.ecs.capability.logging-driver.awslogs"
    },
    {
      "targetId": null,
      "targetType": null,
      "value": null,
      "name": "ecs.capability.execution-role-awslogs"
    },
    {
      "targetId": null,
      "targetType": null,
      "value": null,
      "name": "com.amazonaws.ecs.capability.docker-remote-api.1.19"
    },
    {
      "targetId": null,
      "targetType": null,
      "value": null,
      "name": "ecs.capability.private-registry-authentication.secretsmanager"
    },
    {
      "targetId": null,
      "targetType": null,
      "value": null,
      "name": "com.amazonaws.ecs.capability.docker-remote-api.1.21"
    },
    {
      "targetId": null,
      "targetType": null,
      "value": null,
      "name": "com.amazonaws.ecs.capability.task-iam-role"
    }
  ],
  "pidMode": null,
  "requiresCompatibilities": [
    "EC2"
  ],
  "networkMode": null,
  "cpu": null,
  "revision": 11,
  "status": "ACTIVE",
  "inferenceAccelerators": null,
  "proxyConfiguration": null,
  "volumes": []
}

我运行两个或更多 EC2 实例。

在此处输入图像描述

在此处输入图像描述

以下内容来自 /var/log/messages:

Oct 16 03:49:41 ip-172-31-20-96 amazon-ssm-agent: 2020-10-16 03:49:41 INFO [HealthCheck] HealthCheck reporting agent health.
Oct 16 03:50:36 ip-172-31-20-96 dhclient[3561]: XMT: Solicit on eth0, interval 109600ms.
Oct 16 03:51:31 ip-172-31-20-96 containerd: time="2020-10-16T03:51:31.670433783Z" level=info msg="shim reaped" id=e72a206c96e3a300a24f3ffe97c889ea83b9a32469bde021b96b503f2fcd044c
Oct 16 03:51:31 ip-172-31-20-96 dockerd: time="2020-10-16T03:51:31.681898014Z" level=info msg="ignoring event" module=libcontainerd namespace=moby topic=/tasks/delete type="*events.TaskDelete"
Oct 16 03:51:31 ip-172-31-20-96 kernel: veth54fda6e: renamed from eth0
Oct 16 03:51:31 ip-172-31-20-96 kernel: docker0: port 1(vetha672ca6) entered disabled state
Oct 16 03:51:31 ip-172-31-20-96 kernel: docker0: port 1(vetha672ca6) entered disabled state
Oct 16 03:51:31 ip-172-31-20-96 kernel: device vetha672ca6 left promiscuous mode
Oct 16 03:51:31 ip-172-31-20-96 kernel: docker0: port 1(vetha672ca6) entered disabled state
Oct 16 03:51:42 ip-172-31-20-96 kernel: docker0: port 1(vethbb18961) entered blocking state
Oct 16 03:51:42 ip-172-31-20-96 kernel: docker0: port 1(vethbb18961) entered disabled state
Oct 16 03:51:42 ip-172-31-20-96 kernel: device vethbb18961 entered promiscuous mode
Oct 16 03:51:42 ip-172-31-20-96 kernel: IPv6: ADDRCONF(NETDEV_UP): vethbb18961: link is not ready
Oct 16 03:51:42 ip-172-31-20-96 containerd: time="2020-10-16T03:51:42.653192466Z" level=info msg="shim containerd-shim started" address=/containerd-shim/215fd03db701ddcb24f78c1107110f46e6321610c6c8883f87fc5369d83bf52c.sock debug=false pid=19760
Oct 16 03:51:42 ip-172-31-20-96 kernel: eth0: renamed from veth7926a05
Oct 16 03:51:42 ip-172-31-20-96 kernel: IPv6: ADDRCONF(NETDEV_CHANGE): vethbb18961: link becomes ready

我在这里(https://github.com/containerd/containerd/issues/2202)读到,“shim reaped”可能是由内存问题引起的,于是按照其中的说明禁用了透明大页(transparent hugepages)。

我确认它们已经被禁用:

[ec2-user@ip-172-31-20-96 ~]$ cat  /sys/kernel/mm/transparent_hugepage/enabled
always madvise [never]
[ec2-user@ip-172-31-20-96 ~]$ cat /sys/kernel/mm/transparent_hugepage/defrag
always defer defer+madvise madvise [never]
[ec2-user@ip-172-31-20-96 ~]$

在我将内存预留从 512 MB 增加到 1024 MB 之后,图表出现了变化。

在此处输入图像描述

标签: amazon-web-services, amazon-ecs

解决方案


推荐阅读