//   Copyright 2016 DigitalOcean
//
//   Licensed under the Apache License, Version 2.0 (the "License");
//   you may not use this file except in compliance with the License.
//   You may obtain a copy of the License at
//
//       http://www.apache.org/licenses/LICENSE-2.0
//
//   Unless required by applicable law or agreed to in writing, software
//   distributed under the License is distributed on an "AS IS" BASIS,
//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//   See the License for the specific language governing permissions and
//   limitations under the License.

package collectors

import (
	"encoding/json"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/sirupsen/logrus"
)

// MonitorCollector is used to extract stats related to monitors
// running within Ceph cluster. As we extract information pertaining
// to each monitor instance, there are various vector metrics we
// need to use.
type MonitorCollector struct {
	conn   Conn
	logger *logrus.Logger

	// TotalKBs display the total storage a given monitor node has.
	TotalKBs *prometheus.GaugeVec

	// UsedKBs depict how much of the total storage our monitor process
	// has utilized.
	UsedKBs *prometheus.GaugeVec

	// AvailKBs shows the space left unused.
	AvailKBs *prometheus.GaugeVec

	// PercentAvail shows the amount of unused space as a percentage of total
	// space.
	PercentAvail *prometheus.GaugeVec

	// Store exposes information about internal backing store.
	Store Store

	// ClockSkew shows how far the monitor clocks have skewed from each other. This
	// is an important metric because the functioning of Ceph's paxos depends on
	// the clocks being aligned as close to each other as possible.
	ClockSkew *prometheus.GaugeVec

	// Latency displays the time the monitors take to communicate between themselves.
	Latency *prometheus.GaugeVec

	// NodesinQuorum show the size of the working monitor quorum. Any change in this
	// metric can imply a significant issue in the cluster if it is not manually changed.
	NodesinQuorum prometheus.Gauge
}

// Store displays information about Monitor's FileStore. It is responsible for
// storing all the meta information about the cluster, including monmaps, osdmaps,
// pgmaps, etc. along with logs and other data.
type Store struct {
	// TotalBytes displays the current size of the FileStore.
	TotalBytes *prometheus.GaugeVec

	// SSTBytes shows the amount used by LevelDB's sorted-string tables.
	SSTBytes *prometheus.GaugeVec

	// LogBytes shows the amount used by logs.
	LogBytes *prometheus.GaugeVec

	// MiscBytes shows the amount used by miscellaneous information.
	MiscBytes *prometheus.GaugeVec
}

// NewMonitorCollector creates an instance of the MonitorCollector and instantiates
// the individual metrics that show information about the monitor processes.
func NewMonitorCollector(conn Conn, cluster string, logger *logrus.Logger) *MonitorCollector {
	labels := make(prometheus.Labels)
	labels["cluster"] = cluster

	return &MonitorCollector{
		conn:   conn,
		logger: logger,

		TotalKBs: prometheus.NewGaugeVec(
			prometheus.GaugeOpts{
				Namespace:   cephNamespace,
				Name:        "monitor_capacity_bytes",
				Help:        "Total storage capacity of the monitor node",
				ConstLabels: labels,
			},
			[]string{"monitor"},
		),
		UsedKBs: prometheus.NewGaugeVec(
			prometheus.GaugeOpts{
				Namespace:   cephNamespace,
				Name:        "monitor_used_bytes",
				Help:        "Storage of the monitor node that is currently allocated for use",
				ConstLabels: labels,
			},
			[]string{"monitor"},
		),
		AvailKBs: prometheus.NewGaugeVec(
			prometheus.GaugeOpts{
				Namespace:   cephNamespace,
				Name:        "monitor_avail_bytes",
				Help:        "Total unused storage capacity that the monitor node has left",
				ConstLabels: labels,
			},
			[]string{"monitor"},
		),
		PercentAvail: prometheus.NewGaugeVec(
			prometheus.GaugeOpts{
				Namespace:   cephNamespace,
				Name:        "monitor_avail_percent",
				Help:        "Percentage of total unused storage capacity that the monitor node has left",
				ConstLabels: labels,
			},
			[]string{"monitor"},
		),
		Store: Store{
			TotalBytes: prometheus.NewGaugeVec(
				prometheus.GaugeOpts{
					Namespace:   cephNamespace,
					Name:        "monitor_store_capacity_bytes",
					Help:        "Total capacity of the FileStore backing the monitor daemon",
					ConstLabels: labels,
				},
				[]string{"monitor"},
			),
			SSTBytes: prometheus.NewGaugeVec(
				prometheus.GaugeOpts{
					Namespace:   cephNamespace,
					Name:        "monitor_store_sst_bytes",
					Help:        "Capacity of the FileStore used only for raw SSTs",
					ConstLabels: labels,
				},
				[]string{"monitor"},
			),
			LogBytes: prometheus.NewGaugeVec(
				prometheus.GaugeOpts{
					Namespace:   cephNamespace,
					Name:        "monitor_store_log_bytes",
					Help:        "Capacity of the FileStore used only for logging",
					ConstLabels: labels,
				},
				[]string{"monitor"},
			),
			MiscBytes: prometheus.NewGaugeVec(
				prometheus.GaugeOpts{
					Namespace:   cephNamespace,
					Name:        "monitor_store_misc_bytes",
					Help:        "Capacity of the FileStore used only for storing miscellaneous information",
					ConstLabels: labels,
				},
				[]string{"monitor"},
			),
		},
		ClockSkew: prometheus.NewGaugeVec(
			prometheus.GaugeOpts{
				Namespace:   cephNamespace,
				Name:        "monitor_clock_skew_seconds",
				Help:        "Clock skew the monitor node is incurring",
				ConstLabels: labels,
			},
			[]string{"monitor"},
		),
		Latency: prometheus.NewGaugeVec(
			prometheus.GaugeOpts{
				Namespace:   cephNamespace,
				Name:        "monitor_latency_seconds",
				Help:        "Latency the monitor node is incurring",
				ConstLabels: labels,
			},
			[]string{"monitor"},
		),
		NodesinQuorum: prometheus.NewGauge(
			prometheus.GaugeOpts{
				Namespace:   cephNamespace,
				Name:        "monitor_quorum_count",
				Help:        "The total size of the monitor quorum",
				ConstLabels: labels,
			},
		),
	}
}

func (m *MonitorCollector) collectorList() []prometheus.Collector {
	return []prometheus.Collector{
		m.TotalKBs,
		m.UsedKBs,
		m.AvailKBs,
		m.PercentAvail,

		m.Store.TotalBytes,
		m.Store.SSTBytes,
		m.Store.LogBytes,
		m.Store.MiscBytes,

		m.ClockSkew,
		m.Latency,
	}
}

func (m *MonitorCollector) metricsList() []prometheus.Metric {
	return []prometheus.Metric{
		m.NodesinQuorum,
	}
}

type cephTimeSyncStatus struct {
	TimeChecks map[string]struct {
		Health  string      `json:"health"`
		Latency json.Number `json:"latency"`
		Skew    json.Number `json:"skew"`
	} `json:"time_skew_status"`
}

type cephMonitorStats struct {
	Health struct {
		Health struct {
			HealthServices []struct {
				Mons []struct {
					Name         string      `json:"name"`
					KBTotal      json.Number `json:"kb_total"`
					KBUsed       json.Number `json:"kb_used"`
					KBAvail      json.Number `json:"kb_avail"`
					AvailPercent json.Number `json:"avail_percent"`
					StoreStats   struct {
						BytesTotal json.Number `json:"bytes_total"`
						BytesSST   json.Number `json:"bytes_sst"`
						BytesLog   json.Number `json:"bytes_log"`
						BytesMisc  json.Number `json:"bytes_misc"`
					} `json:"store_stats"`
				} `json:"mons"`
			} `json:"health_services"`
		} `json:"health"`
		TimeChecks struct {
			Mons []struct {
				Name    string      `json:"name"`
				Skew    json.Number `json:"skew"`
				Latency json.Number `json:"latency"`
			} `json:"mons"`
		} `json:"timechecks"`
	} `json:"health"`
	Quorum []int `json:"quorum"`
}

func (m *MonitorCollector) collect() error {
	cmd := m.cephUsageCommand()
	buf, _, err := m.conn.MonCommand(cmd)
	if err != nil {
		m.logger.WithError(err).WithField(
			"args", string(cmd),
		).Error("error executing mon command")

		return err
	}

	stats := &cephMonitorStats{}
	if err := json.Unmarshal(buf, stats); err != nil {
		return err
	}

	cmd = m.cephTimeSyncStatusCommand()
	buf, _, err = m.conn.MonCommand(cmd)
	if err != nil {
		m.logger.WithError(err).WithField(
			"args", string(cmd),
		).Error("error executing mon command")

		return err
	}

	timeStats := &cephTimeSyncStatus{}
	if err := json.Unmarshal(buf, timeStats); err != nil {
		return err
	}

	// Reset daemon specifc metrics; daemons can leave the cluster
	m.TotalKBs.Reset()
	m.UsedKBs.Reset()
	m.AvailKBs.Reset()
	m.PercentAvail.Reset()
	m.Latency.Reset()
	m.ClockSkew.Reset()

	for _, healthService := range stats.Health.Health.HealthServices {
		for _, monstat := range healthService.Mons {
			kbTotal, err := monstat.KBTotal.Float64()
			if err != nil {
				return err
			}
			m.TotalKBs.WithLabelValues(monstat.Name).Set(kbTotal * 1e3)

			kbUsed, err := monstat.KBUsed.Float64()
			if err != nil {
				return err
			}
			m.UsedKBs.WithLabelValues(monstat.Name).Set(kbUsed * 1e3)

			kbAvail, err := monstat.KBAvail.Float64()
			if err != nil {
				return err
			}
			m.AvailKBs.WithLabelValues(monstat.Name).Set(kbAvail * 1e3)

			percentAvail, err := monstat.AvailPercent.Float64()
			if err != nil {
				return err
			}
			m.PercentAvail.WithLabelValues(monstat.Name).Set(percentAvail)

			storeBytes, err := monstat.StoreStats.BytesTotal.Float64()
			if err != nil {
				return err
			}
			m.Store.TotalBytes.WithLabelValues(monstat.Name).Set(storeBytes)

			sstBytes, err := monstat.StoreStats.BytesSST.Float64()
			if err != nil {
				return err
			}
			m.Store.SSTBytes.WithLabelValues(monstat.Name).Set(sstBytes)

			logBytes, err := monstat.StoreStats.BytesLog.Float64()
			if err != nil {
				return err
			}
			m.Store.LogBytes.WithLabelValues(monstat.Name).Set(logBytes)

			miscBytes, err := monstat.StoreStats.BytesMisc.Float64()
			if err != nil {
				return err
			}
			m.Store.MiscBytes.WithLabelValues(monstat.Name).Set(miscBytes)
		}
	}

	for _, monstat := range stats.Health.TimeChecks.Mons {
		skew, err := monstat.Skew.Float64()
		if err != nil {
			return err
		}
		m.ClockSkew.WithLabelValues(monstat.Name).Set(skew)

		latency, err := monstat.Latency.Float64()
		if err != nil {
			return err
		}
		m.Latency.WithLabelValues(monstat.Name).Set(latency)
	}

	for monNode, tstat := range timeStats.TimeChecks {
		skew, err := tstat.Skew.Float64()
		if err != nil {
			return err
		}
		m.ClockSkew.WithLabelValues(monNode).Set(skew)

		latency, err := tstat.Latency.Float64()
		if err != nil {
			return err
		}
		m.Latency.WithLabelValues(monNode).Set(latency)
	}

	m.NodesinQuorum.Set(float64(len(stats.Quorum)))

	return nil
}

func (m *MonitorCollector) cephUsageCommand() []byte {
	cmd, err := json.Marshal(map[string]interface{}{
		"prefix": "status",
		"format": "json",
	})
	if err != nil {
		m.logger.WithError(err).Panic("error marshalling ceph status")
	}
	return cmd
}

func (m *MonitorCollector) cephTimeSyncStatusCommand() []byte {
	cmd, err := json.Marshal(map[string]interface{}{
		"prefix": "time-sync-status",
		"format": "json",
	})
	if err != nil {
		m.logger.WithError(err).Panic("error marshalling ceph time-sync-status")
	}
	return cmd
}

// Describe sends the descriptors of each Monitor related metric we have defined
// to the channel provided.
func (m *MonitorCollector) Describe(ch chan<- *prometheus.Desc) {
	for _, metric := range m.collectorList() {
		metric.Describe(ch)
	}

	for _, metric := range m.metricsList() {
		ch <- metric.Desc()
	}
}

// Collect extracts the given metrics from the Monitors and sends it to the prometheus
// channel.
func (m *MonitorCollector) Collect(ch chan<- prometheus.Metric) {
	m.logger.Debug("collecting ceph monitor metrics")
	if err := m.collect(); err != nil {
		m.logger.WithError(err).Error("error collecting ceph monitor metrics")
		return
	}

	for _, metric := range m.collectorList() {
		metric.Collect(ch)
	}

	for _, metric := range m.metricsList() {
		ch <- metric
	}
}