/*
 * File: mars/kernel/lib_mapfree.c
 * (repository viewer metadata: 397 lines, 9.6 KiB, C)
 */

/*
* MARS Long Distance Replication Software
*
* This file is part of MARS project: http://schoebel.github.io/mars/
*
* Copyright (C) 2010-2014 Thomas Schoebel-Theuer
* Copyright (C) 2011-2014 1&1 Internet AG
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "lib_mapfree.h"
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/types.h>
#include <linux/blkdev.h>
#include <linux/spinlock.h>
#include <linux/wait.h>
#include <linux/file.h>
/*
 * Minimum age, in seconds, an entry must reach before the background
 * thread processes it again. A value <= 0 disables background processing.
 */
int mapfree_period_sec = 10;
EXPORT_SYMBOL_GPL(mapfree_period_sec);

/*
 * Grace space in megabytes which the regular background cleanup subtracts
 * from the lowest recorded position before invalidating pages.
 */
int mapfree_grace_keep_mb = 16;
EXPORT_SYMBOL_GPL(mapfree_grace_keep_mb);

/* mapfree_list is walked under mapfree_mutex (read) and modified under it (write). */
static DECLARE_RWSEM(mapfree_mutex);
static LIST_HEAD(mapfree_list);
/*
 * Invalidate page cache pages of the file behind @mf.
 *
 * @mf:         entry whose mf_filp->f_mapping is to be trimmed.
 * @grace_keep: < 0 forces invalidation of the whole mapping (0 .. -1);
 *              otherwise the number of megabytes subtracted from the lowest
 *              position recorded in mf_min[] before computing the range.
 *
 * Does nothing when no file or no mapping is attached.
 */
static void mapfree_pages(struct mapfree_info *mf, int grace_keep)
{
	struct address_space *mapping;
	pgoff_t first = 0;
	pgoff_t last = 0;

	if (unlikely(!mf->mf_filp))
		return;
	mapping = mf->mf_filp->f_mapping;
	if (unlikely(!mapping))
		return;

	if (grace_keep < 0) {
		/* force full flush */
		last = -1;
	} else {
		unsigned long flags;
		loff_t recent;
		loff_t lowest;

		/*
		 * Take the smaller of the current and the previous round's
		 * minimum; if the current round recorded something, shift it
		 * into the "previous" slot and start a fresh round.
		 */
		traced_lock(&mf->mf_lock, flags);
		recent = mf->mf_min[0];
		lowest = recent;
		if (likely(mf->mf_min[1] < lowest))
			lowest = mf->mf_min[1];
		if (recent) {
			mf->mf_min[1] = recent;
			mf->mf_min[0] = 0;
		}
		traced_unlock(&mf->mf_lock, flags);

		/* grace_keep is given in megabytes */
		lowest -= (loff_t)grace_keep * (1024 * 1024);

		if (lowest > 0 || mf->mf_last) {
			/*
			 * NOTE(review): lowest may be <= 0 here when only
			 * mf_last is non-zero; the page index derived from it
			 * then becomes a very large unsigned value. Confirm
			 * whether this is intended.
			 */
			first = mf->mf_last / PAGE_SIZE;
			/* overlap the previous range by one page */
			if (likely(first > 0))
				first--;
			mf->mf_last = lowest;
			last = lowest / PAGE_SIZE;
		} else if (!grace_keep) {
			/* no progress and no grace requested: flush thoroughly */
			last = -1;
		}
		MARS_DBG("file = '%s' start = %lu end = %lu\n", SAFE_STR(mf->mf_name), first, last);
	}

	if (last > first || last == -1)
		invalidate_mapping_pages(mapping, first, last);
}
/*
 * Drop one reference on @mf and destroy the entry when it was the last one.
 *
 * Destruction unlinks the entry from the global list, invalidates all of the
 * file's cached pages, closes the file and frees name and entry.
 *
 * Within this file it is called either with mapfree_mutex held for writing
 * or on an entry that has not been added to mapfree_list.
 */
static void _mapfree_put(struct mapfree_info *mf)
{
	if (!atomic_dec_and_test(&mf->mf_count))
		return;

	MARS_DBG("closing file '%s' filp = %p\n", mf->mf_name, mf->mf_filp);

	list_del_init(&mf->mf_head);
	CHECK_HEAD_EMPTY(&mf->mf_dirty_anchor);

	if (likely(mf->mf_filp)) {
		/* a negative grace value requests a full invalidation */
		mapfree_pages(mf, -1);
		filp_close(mf->mf_filp, NULL);
	}

	brick_string_free(mf->mf_name);
	brick_mem_free(mf);
}
/*
 * Release a reference on @mf.
 *
 * Takes mapfree_mutex for writing around _mapfree_put(), which may unlink
 * the entry from mapfree_list and free it when the last reference is dropped.
 */
void mapfree_put(struct mapfree_info *mf)
{
down_write(&mapfree_mutex);
_mapfree_put(mf);
up_write(&mapfree_mutex);
}
EXPORT_SYMBOL_GPL(mapfree_put);
/*
 * mapfree_get() - get a referenced mapfree_info for the file @name.
 *
 * @name:  path handed to filp_open().
 * @flags: open flags; together with @name they form the sharing key.
 *
 * Unless O_DIRECT is set in @flags, an already registered entry with the
 * same name and flags is reused and its reference count is incremented.
 * Otherwise a new entry is allocated, the file is opened with mode 0600
 * and - again only without O_DIRECT - the entry is appended to the global
 * mapfree_list. O_DIRECT entries are never shared.
 *
 * Returns the entry, or NULL when the file cannot be opened or has no
 * mapping / host inode. Allocation failures are logged and retried.
 * Drop the reference with mapfree_put().
 */
struct mapfree_info *mapfree_get(const char *name, int flags)
{
	struct mapfree_info *mf = NULL;
	struct list_head *tmp;

	/* Fast path: look for an existing shareable entry. */
	if (!(flags & O_DIRECT)) {
		down_read(&mapfree_mutex);
		for (tmp = mapfree_list.next; tmp != &mapfree_list; tmp = tmp->next) {
			struct mapfree_info *_mf = container_of(tmp, struct mapfree_info, mf_head);

			if (_mf->mf_flags == flags && !strcmp(_mf->mf_name, name)) {
				mf = _mf;
				atomic_inc(&mf->mf_count);
				break;
			}
		}
		up_read(&mapfree_mutex);

		if (mf)
			goto done;
	}

	/* Slow path: create a new entry; the loop only repeats on allocation failure. */
	for (;;) {
		struct address_space *mapping;
		struct inode *inode;
		int ra = 1;
		int prot = 0600;
		mm_segment_t oldfs;

		mf = brick_zmem_alloc(sizeof(struct mapfree_info));
		if (unlikely(!mf)) {
			MARS_ERR("no mem, name = '%s'\n", name);
			continue;
		}

		mf->mf_name = brick_strdup(name);
		if (unlikely(!mf->mf_name)) {
			MARS_ERR("no mem, name = '%s'\n", name);
			brick_mem_free(mf);
			continue;
		}

		mf->mf_flags = flags;
		INIT_LIST_HEAD(&mf->mf_head);
		INIT_LIST_HEAD(&mf->mf_dirty_anchor);
		atomic_set(&mf->mf_count, 1);
		spin_lock_init(&mf->mf_lock);
		mf->mf_max = -1;

		/* Open the file with the kernel data segment in effect, then restore. */
		oldfs = get_fs();
		set_fs(get_ds());
		mf->mf_filp = filp_open(name, flags, prot);
		set_fs(oldfs);

		MARS_DBG("file '%s' flags = %d prot = %d filp = %p\n", name, flags, prot, mf->mf_filp);

		if (unlikely(!mf->mf_filp || IS_ERR(mf->mf_filp))) {
			int err = PTR_ERR(mf->mf_filp);

			MARS_ERR("can't open file '%s' status=%d\n", name, err);
			/* nothing was opened: make sure _mapfree_put() does not close it */
			mf->mf_filp = NULL;
			_mapfree_put(mf);
			mf = NULL;
			break;
		}

		if (unlikely(!(mapping = mf->mf_filp->f_mapping) ||
			     !(inode = mapping->host))) {
			MARS_ERR("file '%s' has no mapping\n", name);
			/*
			 * The open itself succeeded, so the file must be closed
			 * here; merely clearing mf_filp would leak it. It is
			 * closed directly (instead of via _mapfree_put()) to
			 * skip page invalidation on the unusable mapping.
			 */
			filp_close(mf->mf_filp, NULL);
			mf->mf_filp = NULL;
			_mapfree_put(mf);
			mf = NULL;
			break;
		}

		/* Remove __GFP_IO and __GFP_FS from the mapping's allocation mask. */
		mapping_set_gfp_mask(mapping, mapping_gfp_mask(mapping) & ~(__GFP_IO | __GFP_FS));

		mf->mf_max = i_size_read(inode);

		if (S_ISBLK(inode->i_mode)) {
			MARS_INF("changing blkdev readahead from %lu to %d\n", inode->i_bdev->bd_disk->queue->backing_dev_info.ra_pages, ra);
			inode->i_bdev->bd_disk->queue->backing_dev_info.ra_pages = ra;
		}

		if (flags & O_DIRECT) { /* never share them */
			break;
		}

		/*
		 * Maintain the global list of all open files. Somebody else may
		 * have registered the same file since the lookup above; in that
		 * case discard the new entry and use the registered one.
		 */
		down_write(&mapfree_mutex);
		for (tmp = mapfree_list.next; tmp != &mapfree_list; tmp = tmp->next) {
			struct mapfree_info *_mf = container_of(tmp, struct mapfree_info, mf_head);

			if (unlikely(_mf->mf_flags == flags && !strcmp(_mf->mf_name, name))) {
				MARS_WRN("race on creation of '%s' detected\n", name);
				_mapfree_put(mf);
				mf = _mf;
				atomic_inc(&mf->mf_count);
				goto leave;
			}
		}
		list_add_tail(&mf->mf_head, &mapfree_list);
leave:
		up_write(&mapfree_mutex);
		break;
	}
done:
	return mf;
}
EXPORT_SYMBOL_GPL(mapfree_get);
/*
 * Record a position range for @mf.
 *
 * @min: becomes the current-round minimum mf_min[0] when that slot is still
 *       unset (0) or holds a larger value.
 * @max: raises mf_max when non-negative and larger than the stored value.
 *
 * Both updates happen under mf_lock.
 */
void mapfree_set(struct mapfree_info *mf, loff_t min, loff_t max)
{
	unsigned long flags;

	traced_lock(&mf->mf_lock, flags);

	if (mf->mf_min[0] == 0 || min < mf->mf_min[0])
		mf->mf_min[0] = min;

	if (max >= 0 && max > mf->mf_max)
		mf->mf_max = max;

	traced_unlock(&mf->mf_lock, flags);
}
EXPORT_SYMBOL_GPL(mapfree_set);
static
int mapfree_thread(void *data)
{
while (!brick_thread_should_stop()) {
struct mapfree_info *mf = NULL;
struct list_head *tmp;
long long eldest = 0;
brick_msleep(500);
if (mapfree_period_sec <= 0)
continue;
down_read(&mapfree_mutex);
for (tmp = mapfree_list.next; tmp != &mapfree_list; tmp = tmp->next) {
struct mapfree_info *_mf = container_of(tmp, struct mapfree_info, mf_head);
if (unlikely(!_mf->mf_jiffies)) {
_mf->mf_jiffies = jiffies;
continue;
}
if ((long long)jiffies - _mf->mf_jiffies > mapfree_period_sec * HZ &&
(!mf || _mf->mf_jiffies < eldest)) {
mf = _mf;
eldest = _mf->mf_jiffies;
}
}
if (mf)
atomic_inc(&mf->mf_count);
up_read(&mapfree_mutex);
if (!mf) {
continue;
}
mapfree_pages(mf, mapfree_grace_keep_mb);
mf->mf_jiffies = jiffies;
mapfree_put(mf);
}
return 0;
}
////////////////// dirty IOs on the fly //////////////////
/*
 * Put @di at the front of @mf's dirty list.
 *
 * Nothing happens when @di carries no mref. The entry is first removed from
 * wherever dirty_head is currently linked and then added to mf_dirty_anchor,
 * all under mf_lock.
 */
void mf_insert_dirty(struct mapfree_info *mf, struct dirty_info *di)
{
	unsigned long flags = 0;

	if (unlikely(!di->dirty_mref))
		return;

	traced_lock(&mf->mf_lock, flags);
	list_del(&di->dirty_head);
	list_add(&di->dirty_head, &mf->mf_dirty_anchor);
	traced_unlock(&mf->mf_lock, flags);
}
EXPORT_SYMBOL_GPL(mf_insert_dirty);
/*
 * Unlink @di from the dirty list it is on, if any.
 *
 * The emptiness test is made without the lock; the actual unlinking is
 * done under mf_lock and leaves dirty_head re-initialised.
 */
void mf_remove_dirty(struct mapfree_info *mf, struct dirty_info *di)
{
	unsigned long flags = 0;

	if (list_empty(&di->dirty_head))
		return;

	traced_lock(&mf->mf_lock, flags);
	list_del_init(&di->dirty_head);
	traced_unlock(&mf->mf_lock, flags);
}
EXPORT_SYMBOL_GPL(mf_remove_dirty);
/*
 * Extend the range [*min, *max] over the dirty requests queued on @mf.
 *
 * Only entries that carry an mref and whose dirty_stage lies within
 * [min_stage, max_stage] are considered. *min is lowered to the smallest
 * ref_pos and *max is raised to the largest ref_pos + ref_len found.
 * The function only ever compares against the incoming values, so the
 * caller has to initialise *min and *max.
 */
void mf_get_dirty(struct mapfree_info *mf, loff_t *min, loff_t *max, int min_stage, int max_stage)
{
	struct dirty_info *di;
	unsigned long flags = 0;

	traced_lock(&mf->mf_lock, flags);
	list_for_each_entry(di, &mf->mf_dirty_anchor, dirty_head) {
		struct mref_object *mref = di->dirty_mref;

		if (unlikely(!mref))
			continue;
		if (di->dirty_stage < min_stage || di->dirty_stage > max_stage)
			continue;

		if (mref->ref_pos < *min)
			*min = mref->ref_pos;
		if (mref->ref_pos + mref->ref_len > *max)
			*max = mref->ref_pos + mref->ref_len;
	}
	traced_unlock(&mf->mf_lock, flags);
}
EXPORT_SYMBOL_GPL(mf_get_dirty);
/*
 * Run mf_get_dirty() on every registered entry whose name equals @filename.
 *
 * The global list is walked under mapfree_mutex held for reading; entries
 * are matched by name only, regardless of their open flags.
 */
void mf_get_any_dirty(const char *filename, loff_t *min, loff_t *max, int min_stage, int max_stage)
{
	struct mapfree_info *mf;

	down_read(&mapfree_mutex);
	list_for_each_entry(mf, &mapfree_list, mf_head) {
		if (strcmp(mf->mf_name, filename) != 0)
			continue;
		mf_get_dirty(mf, min, max, min_stage, max_stage);
	}
	up_read(&mapfree_mutex);
}
EXPORT_SYMBOL_GPL(mf_get_any_dirty);
////////////////// module init stuff /////////////////////////
/* Handle of the background thread; NULL while it is not running. */
static struct task_struct *mf_thread;

/*
 * Start the background mapfree thread.
 *
 * Returns 0 on success or -ENOMEM when the thread could not be created.
 */
int __init init_mars_mapfree(void)
{
	MARS_DBG("init_mapfree()\n");

	mf_thread = brick_thread_create(mapfree_thread, NULL, "mars_mapfree");
	if (likely(mf_thread))
		return 0;

	MARS_ERR("could not create mapfree thread\n");
	return -ENOMEM;
}
/*
 * Stop the background mapfree thread if it was started and forget its handle.
 */
void exit_mars_mapfree(void)
{
	MARS_DBG("exit_mapfree()\n");

	if (unlikely(!mf_thread))
		return;

	brick_thread_stop(mf_thread);
	mf_thread = NULL;
}
/*
 * Stand-alone module registration: only compiled when
 * CONFIG_MARS_HAVE_BIGMODULE is not defined. In that case this file
 * declares its own module metadata and init/exit entry points.
 */
#ifndef CONFIG_MARS_HAVE_BIGMODULE
MODULE_DESCRIPTION("MARS mapfree infrastructure");
MODULE_AUTHOR("Thomas Schoebel-Theuer <tst@{schoebel-theuer,1und1}.de>");
MODULE_LICENSE("GPL");
module_init(init_mars_mapfree);
module_exit(exit_mars_mapfree);
#endif