ceph/monitoring/ceph-mixin/dashboards/rgw.libsonnet
Arthur Outhenin-Chalandre 98236e3a1d
mgr/dashboard: monitoring: refactor into ceph-mixin
A mixin is a way to bundle dashboards, Prometheus rules and alerts into a
jsonnet package. Shifting to a mixin will allow easier integration with the
monitoring automation that some users may use.

This commit moves `/monitoring/grafana/dashboards` and
`/monitoring/prometheus` to `/monitoring/ceph-mixin`. The Prometheus alerts
were also converted to Jsonnet automatically (from yaml to json to
jsonnet). This commit minimises the changes made to the generated files
and should change neither the dashboards nor the Prometheus alerts.

In the future, some configuration will also be added to the jsonnet to
enable more functionality in the dashboards or alerts (e.g. multi-cluster
support).

Fixes: https://tracker.ceph.com/issues/53374
Signed-off-by: Arthur Outhenin-Chalandre <arthur.outhenin-chalandre@cern.ch>
2022-02-03 13:08:20 +01:00
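
As a sketch of how a consumer might use the bundled package (a minimal,
hypothetical example: the `vendor/` path, the `mixin.libsonnet` entry point
and the jsonnet invocation are assumptions, not part of this change):

    // consumer.jsonnet -- hypothetical; names and paths are assumptions
    // Render with: jsonnet -J . consumer.jsonnet
    local ceph = import 'vendor/ceph-mixin/mixin.libsonnet';
    {
      // grafanaDashboards is a hidden (::) field; reading it by name works
      dashboards: ceph.grafanaDashboards,
    }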


local g = import 'grafonnet/grafana.libsonnet';
local u = import 'utils.libsonnet';
{
grafanaDashboards+:: {
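// The +:: makes this a hidden, mergeable field, so the dashboards are not
// rendered directly; each key below is the output filename of one dashboard.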
'radosgw-sync-overview.json':
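// Panel factory for this dashboard: every panel plots a per-source_zone
// 30s rate of the given rgwMetric; x/y/w/h become the panel's gridPos.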
local RgwSyncOverviewPanel(title, formatY1, labelY1, rgwMetric, x, y, w, h) =
u.graphPanelSchema({},
title,
'',
'null as zero',
true,
formatY1,
'short',
labelY1,
null,
0,
1,
'$datasource')
.addTargets(
[u.addTargetSchema('sum by (source_zone) (rate(%s[30s]))' % rgwMetric,
1,
'time_series',
'{{source_zone}}')]
) + { gridPos: { x: x, y: y, w: w, h: h } };
u.dashboardSchema(
'RGW Sync Overview',
'',
'rgw-sync-overview',
'now-1h',
'15s',
16,
['overview'],
'',
{
refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
}
)
.addAnnotation(
u.addAnnotationSchema(
1,
'-- Grafana --',
true,
true,
'rgba(0, 211, 255, 1)',
'Annotations & Alerts',
'dashboard'
)
)
.addRequired(
type='grafana', id='grafana', name='Grafana', version='5.0.0'
)
.addRequired(
type='panel', id='graph', name='Graph', version='5.0.0'
)
.addTemplate(
u.addTemplateSchema('rgw_servers', '$datasource', 'prometheus', 1, true, 1, '', '')
)
.addTemplate(
g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
)
.addPanels([
RgwSyncOverviewPanel(
'Replication (throughput) from Source Zone',
'Bps',
null,
'ceph_data_sync_from_zone_fetch_bytes_sum',
0,
0,
8,
7
),
RgwSyncOverviewPanel(
'Replication (objects) from Source Zone',
'short',
'Objects/s',
'ceph_data_sync_from_zone_fetch_bytes_count',
8,
0,
8,
7
),
RgwSyncOverviewPanel(
'Polling Request Latency from Source Zone',
'ms',
null,
'ceph_data_sync_from_zone_poll_latency_sum',
16,
0,
8,
7
),
RgwSyncOverviewPanel(
'Unsuccessful Object Replications from Source Zone',
'short',
'Count/s',
'ceph_data_sync_from_zone_fetch_errors',
0,
7,
8,
7
),
]),
'radosgw-overview.json':
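// Panel factory shared by the RGW Overview panels. Compared to the
// sync-overview helper it also takes a description, a second y-axis format,
// an overridable datasource and optional legend settings; extra queries can
// be attached with .addTargets() on the returned panel.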
local RgwOverviewPanel(
title,
description,
formatY1,
formatY2,
expr1,
legendFormat1,
x,
y,
w,
h,
datasource='$datasource',
legend_alignAsTable=false,
legend_avg=false,
legend_min=false,
legend_max=false,
legend_current=false,
legend_values=false
) =
u.graphPanelSchema(
{},
title,
description,
'null',
false,
formatY1,
formatY2,
null,
null,
0,
1,
datasource,
legend_alignAsTable,
legend_avg,
legend_min,
legend_max,
legend_current,
legend_values
)
.addTargets(
[u.addTargetSchema(expr1, 1, 'time_series', legendFormat1)]
) + { gridPos: { x: x, y: y, w: w, h: h } };
u.dashboardSchema(
'RGW Overview',
'',
'WAkugZpiz',
'now-1h',
'15s',
16,
['overview'],
'',
{
refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
}
)
.addAnnotation(
u.addAnnotationSchema(
1,
'-- Grafana --',
true,
true,
'rgba(0, 211, 255, 1)',
'Annotations & Alerts',
'dashboard'
)
)
.addRequired(
type='grafana', id='grafana', name='Grafana', version='5.0.0'
)
.addRequired(
type='panel', id='graph', name='Graph', version='5.0.0'
)
.addTemplate(
u.addTemplateSchema(
'rgw_servers',
'$datasource',
'label_values(ceph_rgw_metadata, ceph_daemon)',
1,
true,
1,
'',
''
)
)
.addTemplate(
u.addTemplateSchema(
'code',
'$datasource',
'label_values(haproxy_server_http_responses_total{instance=~"$ingress_service"}, code)',
1,
true,
1,
'HTTP Code',
''
)
)
.addTemplate(
u.addTemplateSchema(
'ingress_service',
'$datasource',
'label_values(haproxy_server_status, instance)',
1,
true,
1,
'Ingress Service',
''
)
)
.addTemplate(
g.template.datasource('datasource',
'prometheus',
'default',
label='Data Source')
)
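// The $code and $ingress_service variables defined above feed the HAProxy
// panels in the second row below.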
.addPanels([
u.addRowSchema(false,
true,
'RGW Overview - All Gateways') +
{
gridPos: { x: 0, y: 0, w: 24, h: 1 },
},
RgwOverviewPanel(
'Average GET/PUT Latencies',
'',
's',
'short',
'rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata',
'GET AVG',
0,
1,
8,
7
).addTargets(
[
u.addTargetSchema(
'rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata',
1,
'time_series',
'PUT AVG'
),
]
),
RgwOverviewPanel(
'Total Requests/sec by RGW Instance',
'',
'none',
'short',
'sum by (rgw_host) (label_replace(rate(ceph_rgw_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"))',
'{{rgw_host}}',
8,
1,
7,
7
),
RgwOverviewPanel(
'GET Latencies by RGW Instance',
'Latencies are shown stacked, without a y-axis, to provide a visual indication of GET latency imbalance across RGW hosts',
's',
'short',
'label_replace(\n rate(ceph_rgw_get_initial_lat_sum[30s]) /\n rate(ceph_rgw_get_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n"rgw_host", "$1", "ceph_daemon", "rgw.(.*)")',
'{{rgw_host}}',
15,
1,
6,
7
),
RgwOverviewPanel(
'Bandwidth Consumed by Type',
'Total bytes transferred in/out of all radosgw instances within the cluster',
'bytes',
'short',
'sum(rate(ceph_rgw_get_b[30s]))',
'GETs',
0,
8,
8,
6
).addTargets(
[u.addTargetSchema('sum(rate(ceph_rgw_put_b[30s]))',
1,
'time_series',
'PUTs')]
),
RgwOverviewPanel(
'Bandwidth by RGW Instance',
'Total bytes transferred in/out through get/put operations, by radosgw instance',
'bytes',
'short',
'label_replace(sum by (instance_id) (\n rate(ceph_rgw_get_b[30s]) + \n rate(ceph_rgw_put_b[30s])\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)")',
'{{rgw_host}}',
8,
8,
7,
6
),
RgwOverviewPanel(
'PUT Latencies by RGW Instance',
'Latencies are shown stacked, without a y-axis, to provide a visual indication of PUT latency imbalance across RGW hosts',
's',
'short',
'label_replace(\n rate(ceph_rgw_put_initial_lat_sum[30s]) /\n rate(ceph_rgw_put_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n"rgw_host", "$1", "ceph_daemon", "rgw.(.*)")',
'{{rgw_host}}',
15,
8,
6,
6
),
u.addRowSchema(
false, true, 'RGW Overview - HAProxy Metrics'
) + { gridPos: { x: 0, y: 12, w: 9, h: 12 } },
RgwOverviewPanel(
'Total responses by HTTP code',
'',
'short',
'short',
'sum(irate(haproxy_frontend_http_responses_total{code=~"$code",instance=~"$ingress_service",proxy=~"frontend"}[5m])) by (code)',
'Frontend {{ code }}',
0,
12,
5,
12,
'$datasource',
true,
true,
true,
true,
true,
true
)
.addTargets(
[u.addTargetSchema('sum(irate(haproxy_backend_http_responses_total{code=~"$code",instance=~"$ingress_service",proxy=~"backend"}[5m])) by (code)', 1, 'time_series', 'Backend {{ code }}')]
)
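// Flip the backend series below the x-axis so frontend and backend
// response counts can be compared on a single panel.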
.addSeriesOverride([
{
alias: '/.*Back.*/',
transform: 'negative-Y',
},
{ alias: '/.*1.*/' },
{ alias: '/.*2.*/' },
{ alias: '/.*3.*/' },
{ alias: '/.*4.*/' },
{ alias: '/.*5.*/' },
{ alias: '/.*other.*/' },
]),
RgwOverviewPanel(
'Total requests / responses',
'',
'short',
'short',
'sum(irate(haproxy_frontend_http_requests_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)',
'Requests',
5,
12,
5,
12,
'$datasource',
true,
true,
true,
true,
true,
true
)
.addTargets(
[
u.addTargetSchema('sum(irate(haproxy_backend_response_errors_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Response errors'),
u.addTargetSchema('sum(irate(haproxy_frontend_request_errors_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Requests errors'),
u.addTargetSchema('sum(irate(haproxy_backend_redispatch_warnings_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Backend redispatch'),
u.addTargetSchema('sum(irate(haproxy_backend_retry_warnings_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Backend retry'),
u.addTargetSchema('sum(irate(haproxy_frontend_requests_denied_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Request denied'),
u.addTargetSchema('sum(haproxy_backend_current_queue{proxy=~"backend",instance=~"$ingress_service"}) by (instance)', 2, 'time_series', 'Backend Queued'),
]
)
.addSeriesOverride([
{
alias: '/.*Response.*/',
transform: 'negative-Y',
},
{
alias: '/.*Backend.*/',
transform: 'negative-Y',
},
]),
RgwOverviewPanel(
'Total number of connections',
'',
'short',
'short',
'sum(irate(haproxy_frontend_connections_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)',
'Front',
10,
12,
5,
12,
'$datasource',
true,
true,
true,
true,
true,
true
)
.addTargets(
[
u.addTargetSchema('sum(irate(haproxy_backend_connection_attempts_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Back'),
u.addTargetSchema('sum(irate(haproxy_backend_connection_errors_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Back errors'),
]
)
.addSeriesOverride([
{
alias: '/.*Back.*/',
transform: 'negative-Y',
},
]),
RgwOverviewPanel(
'Current total of incoming / outgoing bytes',
'',
'short',
'short',
'sum(irate(haproxy_frontend_bytes_in_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])*8) by (instance)',
'IN Front',
15,
12,
6,
12,
'$datasource',
true,
true,
true,
true,
true,
true
)
.addTargets(
[
u.addTargetSchema('sum(irate(haproxy_frontend_bytes_out_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'OUT Front'),
u.addTargetSchema('sum(irate(haproxy_backend_bytes_in_total{proxy=~"backend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'IN Back'),
u.addTargetSchema('sum(irate(haproxy_backend_bytes_out_total{proxy=~"backend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'OUT Back'),
]
)
.addSeriesOverride([
{
alias: '/.*OUT.*/',
transform: 'negative-Y',
},
]),
]),
'radosgw-detail.json':
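// Panel factory for the per-instance detail dashboard: two queries per
// panel, with aliasColors pinning fixed colours to named series.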
local RgwDetailsPanel(aliasColors,
title,
description,
formatY1,
formatY2,
expr1,
expr2,
legendFormat1,
legendFormat2,
x,
y,
w,
h) =
u.graphPanelSchema(aliasColors,
title,
description,
'null',
false,
formatY1,
formatY2,
null,
null,
0,
1,
'$datasource')
.addTargets(
[u.addTargetSchema(expr1, 1, 'time_series', legendFormat1), u.addTargetSchema(expr2, 1, 'time_series', legendFormat2)]
) + { gridPos: { x: x, y: y, w: w, h: h } };
u.dashboardSchema(
'RGW Instance Detail',
'',
'x5ARzZtmk',
'now-1h',
'15s',
16,
['overview'],
'',
{
refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
}
)
.addAnnotation(
u.addAnnotationSchema(
1,
'-- Grafana --',
true,
true,
'rgba(0, 211, 255, 1)',
'Annotations & Alerts',
'dashboard'
)
)
.addRequired(
type='grafana', id='grafana', name='Grafana', version='5.0.0'
)
.addRequired(
type='panel',
id='grafana-piechart-panel',
name='Pie Chart',
version='1.3.3'
)
.addRequired(
type='panel', id='graph', name='Graph', version='5.0.0'
)
.addTemplate(
g.template.datasource('datasource',
'prometheus',
'default',
label='Data Source')
)
.addTemplate(
u.addTemplateSchema('rgw_servers',
'$datasource',
'label_values(ceph_rgw_metadata, ceph_daemon)',
1,
true,
1,
'',
'')
)
.addPanels([
u.addRowSchema(false, true, 'RGW Host Detail : $rgw_servers') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
RgwDetailsPanel(
{},
'$rgw_servers GET/PUT Latencies',
'',
's',
'short',
'sum by (instance_id) (rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
'sum by (instance_id) (rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
'GET {{ceph_daemon}}',
'PUT {{ceph_daemon}}',
0,
1,
6,
8
),
RgwDetailsPanel(
{},
'Bandwidth by HTTP Operation',
'',
'bytes',
'short',
'rate(ceph_rgw_get_b[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
'rate(ceph_rgw_put_b[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
'GETs {{ceph_daemon}}',
'PUTs {{ceph_daemon}}',
6,
1,
7,
8
),
RgwDetailsPanel(
{
GETs: '#7eb26d',
Other: '#447ebc',
PUTs: '#eab839',
Requests: '#3f2b5b',
'Requests Failed': '#bf1b00',
},
'HTTP Request Breakdown',
'',
'short',
'short',
'rate(ceph_rgw_failed_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
'rate(ceph_rgw_get[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
'Requests Failed {{ceph_daemon}}',
'GETs {{ceph_daemon}}',
13,
1,
7,
8
)
.addTargets(
[
u.addTargetSchema(
'rate(ceph_rgw_put[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
1,
'time_series',
'PUTs {{ceph_daemon}}'
),
u.addTargetSchema(
'(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
1,
'time_series',
'Other {{ceph_daemon}}'
),
]
),
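// Workload breakdown pie chart; requires the grafana-piechart-panel
// plugin declared via addRequired() above.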
u.addPieChartSchema(
{
GETs: '#7eb26d',
'Other (HEAD,POST,DELETE)': '#447ebc',
PUTs: '#eab839',
Requests: '#3f2b5b',
Failures: '#bf1b00',
}, '$datasource', '', 'Under graph', 'pie', 'Workload Breakdown', 'current'
)
.addTarget(u.addTargetSchema(
'rate(ceph_rgw_failed_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
1,
'time_series',
'Failures {{ceph_daemon}}'
))
.addTarget(u.addTargetSchema(
'rate(ceph_rgw_get[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
1,
'time_series',
'GETs {{ceph_daemon}}'
))
.addTarget(u.addTargetSchema(
'rate(ceph_rgw_put[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
1,
'time_series',
'PUTs {{ceph_daemon}}'
))
.addTarget(u.addTargetSchema(
'(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
1,
'time_series',
'Other (DELETE,LIST) {{ceph_daemon}}'
)) + { gridPos: { x: 20, y: 1, w: 4, h: 8 } },
]),
},
}