From d9a6c14a899032abb191547e7a942c03c08060c7 Mon Sep 17 00:00:00 2001 From: Thomas Schoebel-Theuer Date: Wed, 23 Feb 2011 21:48:06 +0100 Subject: [PATCH] import mars-56.tgz --- Kconfig | 28 + Makefile | 6 +- brick.c | 139 +++- brick.h | 175 +++- log_format.h | 10 +- mars.h | 137 +++- mars_client.c | 509 ++++++++++++ mars_client.h | 45 ++ mars_copy.c | 572 +++++++++++++ mars_copy.h | 62 ++ mars_device_aio.c | 123 ++- mars_device_aio.h | 1 + mars_device_sio.c | 17 +- mars_dummy.c | 38 +- mars_generic.c | 666 +++++++++++++++- mars_if_device.c | 146 ++-- mars_if_device.h | 1 - mars_light.c | 1850 +++++++++++++++++++++++++++++++++++++++++++ mars_net.c | 684 ++++++++++++++++ mars_net.h | 70 ++ mars_server.c | 516 ++++++++++++ mars_server.h | 40 + mars_test.c | 8 +- mars_trans_logger.c | 155 +++- mars_trans_logger.h | 10 +- userspace/marsadm | 279 +++++++ 26 files changed, 6087 insertions(+), 200 deletions(-) create mode 100644 mars_client.c create mode 100644 mars_client.h create mode 100644 mars_copy.c create mode 100644 mars_copy.h create mode 100644 mars_light.c create mode 100644 mars_net.c create mode 100644 mars_net.h create mode 100644 mars_server.c create mode 100644 mars_server.h create mode 100644 userspace/marsadm diff --git a/Kconfig b/Kconfig index 81a89ad7..60ebaf7d 100644 --- a/Kconfig +++ b/Kconfig @@ -63,9 +63,37 @@ config MARS_TRANS_LOGGER ---help--- Experimental storage System. +config MARS_SERVER + tristate "server brick" + depends on MARS + default m + ---help--- + Experimental storage System. + +config MARS_CLIENT + tristate "client brick" + depends on MARS + default m + ---help--- + Experimental storage System. + +config MARS_COPY + tristate "copy brick" + depends on MARS + default m + ---help--- + Experimental storage System. + config MARS_TEST tristate "provisionary TEST" depends on MARS default m ---help--- Experimental storage System. 
+ +config MARS_LIGHT + tristate "MARS Light" + depends on MARS + default m + ---help--- + Experimental storage System. diff --git a/Makefile b/Makefile index 02b51448..70e0d990 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ # Makefile for MARS # -obj-$(CONFIG_MARS) += brick.o mars_generic.o +obj-$(CONFIG_MARS) += brick.o mars_generic.o mars_net.o obj-$(CONFIG_MARS_DUMMY) += mars_dummy.o obj-$(CONFIG_MARS_CHECK) += mars_check.o obj-$(CONFIG_MARS_IF_DEVICE) += mars_if_device.o @@ -11,7 +11,11 @@ obj-$(CONFIG_MARS_DEVICE_SIO) += mars_device_sio.o obj-$(CONFIG_MARS_BUF) += mars_buf.o obj-$(CONFIG_MARS_USEBUF) += mars_usebuf.o obj-$(CONFIG_MARS_TRANS_LOGGER) += mars_trans_logger.o +obj-$(CONFIG_MARS_SERVER) += mars_server.o +obj-$(CONFIG_MARS_CLIENT) += mars_client.o +obj-$(CONFIG_MARS_COPY) += mars_copy.o obj-$(CONFIG_MARS_TEST) += mars_test.o +obj-$(CONFIG_MARS_LIGHT) += mars_light.o #mars-objs := mars_generic.o diff --git a/brick.c b/brick.c index ecc5c861..5e0edf1d 100644 --- a/brick.c +++ b/brick.c @@ -3,8 +3,10 @@ #include #include #include +#include //#define BRICK_DEBUGGING + #define USE_FREELIST #define _STRATEGY @@ -35,8 +37,8 @@ int generic_register_brick_type(const struct generic_brick_type *new_type) continue; } if (!strcmp(brick_types[i]->type_name, new_type->type_name)) { - BRICK_ERR("sorry, bricktype %s is already registered.\n", new_type->type_name); - return -EEXIST; + BRICK_DBG("bricktype %s is already registered.\n", new_type->type_name); + return 0; } } if (found < 0) { @@ -65,16 +67,21 @@ int generic_brick_init_full( const struct generic_brick_type *brick_type, const struct generic_input_type **input_types, const struct generic_output_type **output_types, - char **names) + const char **names) { struct generic_brick *brick = data; int status; int i; - BRICK_DBG("generic_brick_init_full()\n"); - // first, call the generic constructors + BRICK_DBG("brick_type = %s\n", brick_type->type_name); + if (unlikely(!data)) { + BRICK_ERR("invalid 
memory\n"); + return -EINVAL; + } - status = generic_brick_init(brick_type, brick, *names++); + // call the generic constructors + + status = generic_brick_init(brick_type, brick, names ? *names++ : NULL); if (status) return status; data += brick_type->brick_size; @@ -100,7 +107,7 @@ int generic_brick_init_full( struct generic_input *input = data; const struct generic_input_type *type = *input_types++; BRICK_DBG("generic_brick_init_full: calling generic_input_init()\n"); - status = generic_input_init(brick, i, type, input, names ? *names++ : type->type_name); + status = generic_input_init(brick, i, type, input, (names && *names) ? *names++ : type->type_name); if (status) return status; data += type->input_size; @@ -125,7 +132,7 @@ int generic_brick_init_full( struct generic_output *output = data; const struct generic_output_type *type = *output_types++; BRICK_DBG("generic_brick_init_full: calling generic_output_init()\n"); - generic_output_init(brick, i, type, output, names ? *names++ : type->type_name); + /* keep the result: the status check that follows must see it */ status = generic_output_init(brick, i, type, output, (names && *names) ? 
*names++ : type->type_name); if (status) return status; data += type->output_size; @@ -532,4 +539,120 @@ void free_generic(struct generic_object *object) } EXPORT_SYMBOL_GPL(free_generic); +///////////////////////////////////////////////////////////////// + +// helper stuff + +struct semaphore lamport_sem = __SEMAPHORE_INITIALIZER(lamport_sem, 1); // TODO: replace with spinlock if possible (first check) +struct timespec lamport_now = {}; + +void get_lamport(struct timespec *now) +{ + int diff; + + down(&lamport_sem); + + *now = CURRENT_TIME; + diff = timespec_compare(now, &lamport_now); + if (diff > 0) { + memcpy(&lamport_now, now, sizeof(lamport_now)); + } else { + timespec_add_ns(&lamport_now, 1); + memcpy(now, &lamport_now, sizeof(*now)); + } + + up(&lamport_sem); +} + +EXPORT_SYMBOL_GPL(get_lamport); + +void set_lamport(struct timespec *old) +{ + int diff; + + down(&lamport_sem); + + diff = timespec_compare(old, &lamport_now); + if (diff > 0) { + memcpy(&lamport_now, old, sizeof(lamport_now)); + } + + up(&lamport_sem); +} +EXPORT_SYMBOL_GPL(set_lamport); + +void set_button(struct generic_switch *sw, bool val) +{ + bool oldval = sw->button; + if (val != oldval) { + sw->button = val; + sw->trigger = true; + wake_up_interruptible(&sw->event); + } +} +EXPORT_SYMBOL_GPL(set_button); + +void set_led_on(struct generic_switch *sw, bool val) +{ + bool oldval = sw->led_on; + if (val != oldval) { + sw->led_on = val; + sw->trigger = true; + wake_up_interruptible(&sw->event); + } +} +EXPORT_SYMBOL_GPL(set_led_on); + +void set_led_off(struct generic_switch *sw, bool val) +{ + bool oldval = sw->led_off; + if (val != oldval) { + sw->led_off = val; + sw->trigger = true; + wake_up_interruptible(&sw->event); + } +} +EXPORT_SYMBOL_GPL(set_led_off); + + +///////////////////////////////////////////////////////////////// + +// meta stuff + +const struct meta *find_meta(const struct meta *meta, const char *field_name) +{ + const struct meta *tmp; + for (tmp = meta; tmp->field_name[0]; 
tmp++) { + if (!strncmp(field_name, tmp->field_name, MAX_FIELD_LEN)) { + return tmp; + } + } + return NULL; +} +EXPORT_SYMBOL_GPL(find_meta); + +void free_meta(void *data, const struct meta *meta) +{ + for (; meta->field_name[0]; meta++) { + void *item; + switch (meta->field_type) { + case FIELD_SUB: + if (meta->field_ref) { + item = data + meta->field_offset; + free_meta(item, meta->field_ref); + } + break; + case FIELD_REF: + case FIELD_STRING: + item = data + meta->field_offset; + item = *(void**)item; + if (item && meta->field_ref) /* unset pointer: nothing to descend into */ + free_meta(item, meta->field_ref); + kfree(item); + } + } +} +EXPORT_SYMBOL_GPL(free_meta); + + MODULE_LICENSE("GPL"); diff --git a/brick.h b/brick.h index 6c75f3dc..a938c80e 100644 --- a/brick.h +++ b/brick.h @@ -2,8 +2,10 @@ #ifndef BRICK_H #define BRICK_H +#include #include #include +#include #ifdef _STRATEGY #define _STRATEGY_CODE(X) X @@ -13,14 +15,16 @@ #define _NORMAL_CODE(X) X #endif -#define BRICK_ERROR "BRICK_ERROR " __BASE_FILE__ ": " -#define BRICK_INFO "BRICK_INFO " __BASE_FILE__ ": " -#define BRICK_DEBUG "BRICK_DEBUG " __BASE_FILE__ ": " +#define BRICK_ERROR "BRICK_ERROR " +#define BRICK_INFO "BRICK_INFO " +#define BRICK_DEBUG "BRICK_DEBUG " +//#define _BRICK_FMT(fmt) "[%s] " __BASE_FILE__ " %d %s(): " fmt, current->comm, __LINE__, __FUNCTION__ +#define _BRICK_FMT(fmt) __BASE_FILE__ " %d %s(): " fmt, __LINE__, __FUNCTION__ -#define BRICK_ERR(fmt, args...) printk(BRICK_ERROR "%s(): " fmt, __FUNCTION__, ##args) -#define BRICK_INF(fmt, args...) printk(BRICK_INFO "%s(): " fmt, __FUNCTION__, ##args) +#define BRICK_ERR(fmt, args...) printk(BRICK_ERROR _BRICK_FMT(fmt), ##args) +#define BRICK_INF(fmt, args...) printk(BRICK_INFO _BRICK_FMT(fmt), ##args) #ifdef BRICK_DEBUGGING -#define BRICK_DBG(fmt, args...) printk(BRICK_DEBUG "%s(): " fmt, __FUNCTION__, ##args) +#define BRICK_DBG(fmt, args...) printk(BRICK_DEBUG _BRICK_FMT(fmt), ##args) #else #define BRICK_DBG(args...) 
/**/ #endif @@ -120,14 +124,23 @@ struct generic_brick_ops; struct generic_output_ops; struct generic_brick_type; +struct generic_switch { + bool button; + bool led_on; + bool led_off; + bool trigger; + wait_queue_head_t event; +}; + #define GENERIC_BRICK(BRICK) \ - char *brick_name; \ + const char *brick_name; \ const struct BRICK##_brick_type *type; \ struct BRICK##_brick_ops *ops; \ int nr_inputs; \ int nr_outputs; \ struct BRICK##_input **inputs; \ struct BRICK##_output **outputs; \ + struct generic_switch power; \ struct list_head tmp_head; \ struct generic_brick { @@ -135,7 +148,7 @@ struct generic_brick { }; #define GENERIC_INPUT(BRICK) \ - char *input_name; \ + const char *input_name; \ struct BRICK##_brick *brick; \ const struct BRICK##_input_type *type; \ struct BRICK##_output *connect; \ @@ -146,7 +159,7 @@ struct generic_input { }; #define GENERIC_OUTPUT(BRICK) \ - char *output_name; \ + const char *output_name; \ struct BRICK##_brick *brick; \ const struct BRICK##_output_type *type; \ struct BRICK##_output_ops *ops; \ @@ -176,7 +189,7 @@ struct generic_output { ) #define GENERIC_BRICK_OPS(BRICK) \ - int (*brick_switch)(struct BRICK##_brick *brick, bool state); \ + int (*brick_switch)(struct BRICK##_brick *brick); \ struct generic_brick_ops { GENERIC_BRICK_OPS(generic); @@ -193,14 +206,14 @@ struct generic_output_ops { // although possible, *_type should never be extended #define GENERIC_BRICK_TYPE(BRICK) \ - char *type_name; \ + const char *type_name; \ int brick_size; \ int max_inputs; \ int max_outputs; \ const struct BRICK##_input_type **default_input_types; \ - char **default_input_names; \ + const char **default_input_names; \ const struct BRICK##_output_type **default_output_types; \ - char **default_output_names; \ + const char **default_output_names; \ struct BRICK##_brick_ops *master_ops; \ const struct BRICK##_input_types **default_type; \ int (*brick_construct)(struct BRICK##_brick *brick); \ @@ -242,7 +255,7 @@ struct generic_output_type { 
int generic_register_brick_type(const struct generic_brick_type *new_type); int generic_unregister_brick_type(const struct generic_brick_type *old_type); -extern inline void _generic_output_init(struct generic_brick *brick, const struct generic_output_type *type, struct generic_output *output, char *output_name) +inline void _generic_output_init(struct generic_brick *brick, const struct generic_output_type *type, struct generic_output *output, const char *output_name) { output->output_name = output_name; output->brick = brick; @@ -254,18 +267,22 @@ extern inline void _generic_output_init(struct generic_brick *brick, const struc #ifdef _STRATEGY // call this only in strategy bricks, never in ordinary bricks // you need this only if you circumvent generic_brick_init_full() -extern inline int generic_brick_init(const struct generic_brick_type *type, struct generic_brick *brick, char *brick_name) +inline int generic_brick_init(const struct generic_brick_type *type, struct generic_brick *brick, const char *brick_name) { brick->brick_name = brick_name; brick->type = type; brick->ops = type->master_ops; brick->nr_inputs = 0; brick->nr_outputs = 0; + brick->power.led_off = true; + //brick->power.event = __WAIT_QUEUE_HEAD_INITIALIZER(brick->power.event); + init_waitqueue_head(&brick->power.event); + //INIT_LIST_HEAD(&brick->tmp_head); brick->tmp_head.next = brick->tmp_head.prev = &brick->tmp_head; return 0; } -extern inline int generic_input_init(struct generic_brick *brick, int index, const struct generic_input_type *type, struct generic_input *input, char *input_name) +inline int generic_input_init(struct generic_brick *brick, int index, const struct generic_input_type *type, struct generic_input *input, const char *input_name) { if (index < 0 || index >= brick->type->max_inputs) return -ENOMEM; @@ -280,7 +297,7 @@ extern inline int generic_input_init(struct generic_brick *brick, int index, con return 0; } -extern inline int generic_output_init(struct generic_brick 
*brick, int index, const struct generic_output_type *type, struct generic_output *output, char *output_name) +inline int generic_output_init(struct generic_brick *brick, int index, const struct generic_output_type *type, struct generic_output *output, const char *output_name) { if (index < 0 || index >= brick->type->max_outputs) return -ENOMEM; @@ -292,7 +309,7 @@ extern inline int generic_output_init(struct generic_brick *brick, int index, co return 0; } -extern inline int generic_size(const struct generic_brick_type *brick_type) +inline int generic_size(const struct generic_brick_type *brick_type) { int size = brick_type->brick_size; int i; @@ -316,12 +333,12 @@ int generic_brick_init_full( const struct generic_brick_type *brick_type, const struct generic_input_type **input_types, const struct generic_output_type **output_types, - char **names); + const char **names); int generic_brick_exit_full( struct generic_brick *brick); -extern inline int generic_connect(struct generic_input *input, struct generic_output *output) +inline int generic_connect(struct generic_input *input, struct generic_output *output) { BRICK_DBG("generic_connect(input=%p, output=%p)\n", input, output); if (!input || !output) @@ -334,7 +351,7 @@ extern inline int generic_connect(struct generic_input *input, struct generic_ou return 0; } -extern inline int generic_disconnect(struct generic_input *input) +inline int generic_disconnect(struct generic_input *input) { BRICK_DBG("generic_disconnect(input=%p)\n", input); if (!input) @@ -426,14 +443,14 @@ static inline int BRICK##_output_init(struct BRICK##_brick *brick, int index, st \ _STRATEGY_CODE( \ \ -extern inline int INPUT_BRICK##_##OUTPUT_BRICK##_connect( \ +inline int INPUT_BRICK##_##OUTPUT_BRICK##_connect( \ struct INPUT_BRICK##_input *input, \ struct OUTPUT_BRICK##_output *output) \ { \ return generic_connect((struct generic_input*)input, (struct generic_output*)output); \ } \ \ -extern inline int 
INPUT_BRICK##_##OUTPUT_BRICK####_disconnect( \ +inline int INPUT_BRICK##_##OUTPUT_BRICK####_disconnect( \ struct INPUT_BRICK##_input *input) \ { \ return generic_disconnect((struct generic_input*)input); \ @@ -456,7 +473,7 @@ extern void free_generic(struct generic_object *object); #define GENERIC_OBJECT_LAYOUT_FUNCTIONS(BRICK) \ \ -extern inline int BRICK##_init_object_layout(struct BRICK##_output *output, struct generic_object_layout *object_layout, int aspect_max, const struct generic_object_type *object_type) \ +inline int BRICK##_init_object_layout(struct BRICK##_output *output, struct generic_object_layout *object_layout, int aspect_max, const struct generic_object_type *object_type) \ { \ if (likely(object_layout->object_type)) \ return 0; \ @@ -465,7 +482,7 @@ extern inline int BRICK##_init_object_layout(struct BRICK##_output *output, stru #define GENERIC_ASPECT_LAYOUT_FUNCTIONS(BRICK,TYPE) \ \ -extern inline int BRICK##_##TYPE##_add_aspect(struct BRICK##_output *output, struct TYPE##_object_layout *object_layout, const struct generic_aspect_type *aspect_type) \ +inline int BRICK##_##TYPE##_add_aspect(struct BRICK##_output *output, struct TYPE##_object_layout *object_layout, const struct generic_aspect_type *aspect_type) \ { \ int res = generic_add_aspect((struct generic_output*)output, (struct generic_object_layout *)object_layout, aspect_type); \ BRICK_DBG(#BRICK " " #TYPE "added aspect_type %p (%s) to object_layout %p (type %s) on output %p (type %s), status=%d\n", aspect_type, aspect_type->aspect_type_name, object_layout, object_layout->object_type->object_type_name, output, output->type->type_name, res); \ @@ -474,7 +491,7 @@ extern inline int BRICK##_##TYPE##_add_aspect(struct BRICK##_output *output, str #define GENERIC_OBJECT_FUNCTIONS(TYPE) \ \ -extern inline struct TYPE##_object *TYPE##_construct(void *data, struct TYPE##_object_layout *object_layout) \ +inline struct TYPE##_object *TYPE##_construct(void *data, struct TYPE##_object_layout 
*object_layout) \ { \ struct TYPE##_object *obj = data; \ int i; \ @@ -504,7 +521,7 @@ extern inline struct TYPE##_object *TYPE##_construct(void *data, struct TYPE##_o return obj; \ } \ \ -extern inline void TYPE##_destruct(struct TYPE##_object *obj) \ +inline void TYPE##_destruct(struct TYPE##_object *obj) \ { \ struct TYPE##_object_layout *object_layout = obj->object_layout; \ int i; \ @@ -532,7 +549,7 @@ extern inline void TYPE##_destruct(struct TYPE##_object *obj) \ #define GENERIC_ASPECT_FUNCTIONS(BRICK,TYPE) \ \ -extern inline struct BRICK##_##TYPE##_aspect *BRICK##_##TYPE##_get_aspect(struct BRICK##_output *output, struct TYPE##_object *obj) \ +inline struct BRICK##_##TYPE##_aspect *BRICK##_##TYPE##_get_aspect(struct BRICK##_output *output, struct TYPE##_object *obj) \ { \ struct generic_object_layout *object_layout; \ struct generic_aspect_layout *aspect_layout; \ @@ -548,12 +565,12 @@ extern inline struct BRICK##_##TYPE##_aspect *BRICK##_##TYPE##_get_aspect(struct return (void*)obj + aspect_layout->aspect_offset; \ } \ \ -extern inline int BRICK##_##TYPE##_init_object_layout(struct BRICK##_output *output, struct generic_object_layout *object_layout) \ +inline int BRICK##_##TYPE##_init_object_layout(struct BRICK##_output *output, struct generic_object_layout *object_layout) \ { \ return BRICK##_init_object_layout(output, object_layout, 32, &TYPE##_type); \ } \ \ -extern inline struct TYPE##_object *BRICK##_alloc_##TYPE(struct BRICK##_output *output, struct generic_object_layout *object_layout) \ +inline struct TYPE##_object *BRICK##_alloc_##TYPE(struct BRICK##_output *output, struct generic_object_layout *object_layout) \ { \ int status = BRICK##_##TYPE##_init_object_layout(output, object_layout); \ if (status < 0) \ @@ -561,12 +578,12 @@ extern inline struct TYPE##_object *BRICK##_alloc_##TYPE(struct BRICK##_output * return (struct TYPE##_object*)alloc_generic(object_layout); \ } \ \ -extern inline struct TYPE##_object *BRICK##_alloc_##TYPE##_pure(struct 
generic_object_layout *object_layout) \ +inline struct TYPE##_object *BRICK##_alloc_##TYPE##_pure(struct generic_object_layout *object_layout) \ { \ return (struct TYPE##_object*)alloc_generic(object_layout); \ } \ \ -extern inline void BRICK##_free_##TYPE(struct TYPE##_object *object) \ +inline void BRICK##_free_##TYPE(struct TYPE##_object *object) \ { \ free_generic((struct generic_object*)object); \ } \ @@ -577,12 +594,19 @@ GENERIC_OBJECT_FUNCTIONS(generic); /////////////////////////////////////////////////////////////////////// -// some helpers +// some general helpers +extern void get_lamport(struct timespec *now); +extern void set_lamport(struct timespec *old); + + + +#if 0 #undef spin_lock_irqsave #define spin_lock_irqsave(l,f) spin_lock(l) #undef spin_unlock_irqrestore #define spin_unlock_irqrestore(l,f) spin_unlock(l) +#endif #ifdef CONFIG_DEBUG_SPINLOCK @@ -654,4 +678,89 @@ GENERIC_OBJECT_FUNCTIONS(generic); # define traced_writeunlock(spinlock,flags) write_unlock_irqrestore(spinlock,flags) #endif + +extern void set_button(struct generic_switch *sw, bool val); +extern void set_led_on(struct generic_switch *sw, bool val); +extern void set_led_off(struct generic_switch *sw, bool val); + +///////////////////////////////////////////////////////////////////////// + +// metadata descriptions + +/* The idea is to describe your C structures in such a way that + * transfers to disk or over a network become self-describing. + * + * In essence, this is a kind of version-independent marshalling. + * + * Advantage: + * When you extend your original C struct (and of course update the + * corresponding meta structure), old data on disk (or network peers + * running an old version of your program) will remain valid. + * Upon read, newly added fields missing in the old version will be simply + * not filled in and therefore remain zeroed (if you don't forget to + * initially clear your structures via memset() / initializers / etc). 
+ * Note that this works only if you never rename or remove existing + * fields; you should only add new ones. + * [TODO: add macros for description of ignored / renamed fields to + * overcome this limitation] + * You may increase the size of integers, for example from 32bit to 64bit + * or even higher; sign extension will be automatically carried out + * when necessary. + * [TODO; NYI] + * Also, you may change the order of fields, because the metadata interpreter + * will check each field individually; field offsets are automatically + * maintained. + * + * Disadvantage: this adds some (small) overhead. + */ + +#define MAX_FIELD_LEN 24 + +enum field_type { + FIELD_DONE, + FIELD_REF, + FIELD_SUB, + FIELD_STRING, + FIELD_RAW, + FIELD_INT, + FIELD_UINT, +}; + +struct meta { + char field_name[MAX_FIELD_LEN]; + int field_type; + int field_size; + int field_offset; + const struct meta *field_ref; +}; + +#define _META_INI(NAME,STRUCT,TYPE) \ + .field_name = #NAME, \ + .field_type = TYPE, \ + .field_size = sizeof(((STRUCT*)NULL)->NAME), \ + .field_offset = offsetof(STRUCT, NAME) \ + +#define META_INI(NAME,STRUCT,TYPE) { _META_INI(NAME,STRUCT,TYPE) } + +#define _META_INI_REF(NAME,STRUCT,REF) \ + .field_name = #NAME, \ + .field_type = FIELD_REF, \ + .field_size = sizeof(*(((STRUCT*)NULL)->NAME)), \ + .field_offset = offsetof(STRUCT, NAME), \ + .field_ref = REF + +#define META_INI_REF(NAME,STRUCT,REF) { _META_INI_REF(NAME,STRUCT,REF) } + +#define _META_INI_SUB(NAME,STRUCT,SUB) \ + .field_name = #NAME, \ + .field_type = FIELD_SUB, \ + .field_size = sizeof(((STRUCT*)NULL)->NAME), \ + .field_offset = offsetof(STRUCT, NAME), \ + .field_ref = SUB + +#define META_INI_SUB(NAME,STRUCT,SUB) { _META_INI_SUB(NAME,STRUCT,SUB) } + +extern const struct meta *find_meta(const struct meta *meta, const char *field_name); +extern void free_meta(void *data, const struct meta *meta); + #endif diff --git a/log_format.h b/log_format.h index e559c654..97df4bd1 100644 --- a/log_format.h +++ 
b/log_format.h @@ -1,4 +1,12 @@ // (c) 2010 Thomas Schoebel-Theuer / 1&1 Internet AG + +/* Definitions for logfile format. + * + * This is meant for sharing between different transaction logger variants, + * and/or for sharing with userspace tools (e.g. logfile analyzers). + * TODO: factor out kernelspace issues. + */ + #ifndef LOG_FORMAT_H #define LOG_FORMAT_H @@ -182,7 +190,7 @@ bool log_finalize(struct log_status *logst, int len, void (*endio)(struct generi DATA_PUT(data, offset, (char)0); // spare DATA_PUT(data, offset, (short)0); // spare DATA_PUT(data, offset, (int)0); // spare - now = CURRENT_TIME; // when the log entry was ready. + get_lamport(&now); // when the log entry was ready. DATA_PUT(data, offset, now.tv_sec); DATA_PUT(data, offset, now.tv_nsec); diff --git a/mars.h b/mars.h index a3ee718c..9ac969f3 100644 --- a/mars.h +++ b/mars.h @@ -3,25 +3,34 @@ #define MARS_H #include +#include + #include #include #define MARS_DELAY /**/ //#define MARS_DELAY msleep(20000) -#define MARS_FATAL "MARS_FATAL " __BASE_FILE__ ": " -#define MARS_ERROR "MARS_ERROR " __BASE_FILE__ ": " -#define MARS_INFO "MARS_INFO " __BASE_FILE__ ": " -#define MARS_DEBUG "MARS_DEBUG " __BASE_FILE__ ": " +#define MARS_FATAL "MARS_FATAL " +#define MARS_ERROR "MARS_ERROR " +#define MARS_INFO "MARS_INFO " +#define MARS_DEBUG "MARS_DEBUG " +#define _MARS_FMT(fmt) "[%s] " __BASE_FILE__ " %d %s(): " fmt, current->comm, __LINE__, __FUNCTION__ +//#define _MARS_FMT(fmt) _BRICK_FMT(fmt) -#define MARS_FAT(fmt, args...) do { printk(MARS_FATAL "%s(): " fmt, __FUNCTION__, ##args); MARS_DELAY; } while (0) -#define MARS_ERR(fmt, args...) do { printk(MARS_ERROR "%s(): " fmt, __FUNCTION__, ##args); MARS_DELAY; } while (0) -#define MARS_INF(fmt, args...) do { printk(MARS_INFO "%s(): " fmt, __FUNCTION__, ##args); } while (0) +#define MARS_FAT(fmt, args...) do { printk(MARS_FATAL _MARS_FMT(fmt), ##args); MARS_DELAY; } while (0) +#define MARS_ERR(fmt, args...) 
do { printk(MARS_ERROR _MARS_FMT(fmt), ##args); MARS_DELAY; } while (0) +#define MARS_INF(fmt, args...) do { printk(MARS_INFO _MARS_FMT(fmt), ##args); } while (0) #ifdef MARS_DEBUGGING -#define MARS_DBG(fmt, args...) do { printk(MARS_DEBUG "%s(): " fmt, __FUNCTION__, ##args); } while (0) +#define MARS_DBG(fmt, args...) do { printk(MARS_DEBUG _MARS_FMT(fmt), ##args); } while (0) #else #define MARS_DBG(args...) /**/ #endif +#ifdef IO_DEBUGGING +#define MARS_IO MARS_DBG +#else +#define MARS_IO(args...) /*empty*/ +#endif #define BRICK_OBJ_MREF 0 #define BRICK_OBJ_NR 1 @@ -66,6 +75,7 @@ struct mref_object_layout { /* maintained by the ref implementation, readable for callers */ \ int ref_flags; \ int ref_rw; \ + int ref_id; /* not mandatory; may be used for identification */ \ /* maintained by the ref implementation, incrementable for \ * callers (but not decrementable! use ref_put()) */ \ atomic_t ref_count; \ @@ -90,6 +100,9 @@ struct mars_info { #define MARS_BRICK(PREFIX) \ GENERIC_BRICK(PREFIX); \ + struct list_head brick_link; \ + const char *brick_path; \ + struct mars_global *global; \ struct mars_brick { MARS_BRICK(mars); @@ -214,8 +227,6 @@ static const struct generic_aspect_type *BRICK##_aspect_types[BRICK_OBJ_NR] = { MARS_ERR("%d: list_head " #head " (%p) not empty\n", __LINE__, head); \ } \ -#endif - #define CHECK_PTR(ptr,label) \ if (unlikely(!(ptr))) { \ MARS_FAT("%d: ptr " #ptr " is NULL\n", __LINE__); \ @@ -227,3 +238,109 @@ static const struct generic_aspect_type *BRICK##_aspect_types[BRICK_OBJ_NR] = { MARS_FAT("%d: condition " #ptr " is VIOLATED\n", __LINE__); \ goto label; \ } + +extern const struct meta mars_info_meta[]; +extern const struct meta mars_mref_meta[]; + +///////////////////////////////////////////////////////////////////////// + +extern struct mars_global *mars_global; + +extern void mars_trigger(void); +extern void mars_power_button(struct mars_brick *brick, bool val); +extern void mars_power_led_on(struct mars_brick *brick, bool val); 
+extern void mars_power_led_off(struct mars_brick *brick, bool val); + +///////////////////////////////////////////////////////////////////////// + +#ifdef _STRATEGY // call this only in strategy bricks, never in ordinary bricks + +#define MARS_ARGV_MAX 4 + +extern char *my_id(void); + +#define MARS_DENT(TYPE) \ + struct list_head sub_link; \ + struct TYPE *d_parent; \ + char *d_argv[MARS_ARGV_MAX]; /* for internal use, will be automatically deallocated*/ \ + char *d_args; /* ditto uninterpreted */ \ + char *d_name; /* current path component */ \ + char *d_rest; /* some "meaningful" rest of d_name*/ \ + char *d_path; /* full absolute path */ \ + int d_namelen; \ + int d_pathlen; \ + int d_depth; \ + unsigned int d_type; /* from readdir() => often DT_UNKNOWN => don't rely on it, use new_stat.mode instead */ \ + int d_class; /* for pre-grouping order */ \ + int d_serial; /* for pre-grouping order */ \ + int d_version; /* dynamic programming per call of mars_ent_work() */ \ + char d_error; \ + struct kstat new_stat; \ + struct kstat old_stat; \ + char *new_link; \ + char *old_link; \ + void *d_private; + +struct mars_dent { + MARS_DENT(mars_dent); +}; + +extern const struct meta mars_timespec_meta[]; +extern const struct meta mars_kstat_meta[]; +extern const struct meta mars_dent_meta[]; + +struct mars_global { + struct list_head dent_anchor; + struct list_head brick_anchor; + struct generic_switch global_power; + struct semaphore mutex; + volatile bool main_trigger; + wait_queue_head_t main_event; + //void *private; +}; + +typedef int (*mars_dent_checker)(const char *path, const char *name, int namlen, unsigned int d_type, int *prefix, int *serial); +typedef int (*mars_dent_worker)(struct mars_global *global, struct mars_dent *dent, bool direction); + +extern int mars_dent_work(struct mars_global *global, char *dirname, int allocsize, mars_dent_checker checker, mars_dent_worker worker, void *buf, int maxdepth); +extern struct mars_dent *_mars_find_dent(struct 
mars_global *global, const char *path); +extern struct mars_dent *mars_find_dent(struct mars_global *global, const char *path); +extern void mars_dent_free(struct mars_dent *dent); +extern void mars_dent_free_all(struct list_head *anchor); + +extern struct mars_brick *mars_find_brick(struct mars_global *global, const void *brick_type, const char *path); +extern struct mars_brick *mars_make_brick(struct mars_global *global, const void *_brick_type, const char *path, const char *name); + +#define MARS_ERR_ONCE(dent, args...) do { if (!(dent)->d_error++) MARS_ERR(args); } while (0) /* reports only while d_error is still 0; single statement, safe before an else */ + +/* Kludge: our kernel threads will have no mm context, but need one + * for stuff like ioctx_alloc() / aio_setup_ring() etc + * which expect userspace resources. + * We fake one. + * TODO: factor out the userspace stuff from AIO such that + * this fake is no longer necessary. + * Even better: replace do_mmap() in AIO stuff by something + * more friendly to kernelspace apps. + */ +inline void fake_mm(void) +{ + if (!current->mm) { + current->mm = &init_mm; + } +} +/* Cleanup faked mm, otherwise do_exit() will try to destroy + * the wrong thing.... 
+ */ +inline void cleanup_mm(void) +{ + if (current->mm == &init_mm) { + current->mm = NULL; + } +} + +extern int mars_mkdir(const char *path); +extern int mars_symlink(const char *oldpath, const char *newpath, const struct timespec *stamp); +extern int mars_rename(const char *oldpath, const char *newpath); + +#endif +#endif diff --git a/mars_client.c b/mars_client.c new file mode 100644 index 00000000..f21943d4 --- /dev/null +++ b/mars_client.c @@ -0,0 +1,509 @@ +// (c) 2010 Thomas Schoebel-Theuer / 1&1 Internet AG + +// Client brick (just for demonstration) + +//#define BRICK_DEBUGGING +//#define MARS_DEBUGGING +//#define IO_DEBUGGING + +#include +#include +#include +#include + +#include "mars.h" + +///////////////////////// own type definitions //////////////////////// + +#include "mars_client.h" + +///////////////////////// own helper functions //////////////////////// + +static int thread_count = 0; + +static void _kill_socket(struct client_output *output) +{ + if (output->socket) { + MARS_DBG("shutdown socket\n"); + kernel_sock_shutdown(output->socket, SHUT_WR); + //sock_release(output->socket); + output->socket = NULL; + } +} + +static void _kill_thread(struct client_threadinfo *ti) +{ + if (ti->thread) { + kthread_stop(ti->thread); + } +} + +static int _connect(struct client_output *output, const char *str) +{ + struct sockaddr_storage sockaddr = {}; + int status; + + if (!output->host) { + output->host = kstrdup(str, GFP_MARS); + status = -EINVAL; + if (!output->host) + goto done; + output->path = strchr(output->host, '+'); + if (!output->path) { + kfree(output->host); + output->host = NULL; + goto done; + } + *output->path++ = '\0'; + } + + status = mars_create_sockaddr(&sockaddr, output->host); + if (unlikely(status < 0)) + goto done; + + status = mars_create_socket(&output->socket, &sockaddr, false); + if (unlikely(status < 0)) { + output->socket = NULL; + goto done; + } + + { + struct mars_cmd cmd = { + .cmd_code = CMD_CONNECT, + .cmd_str1 = 
output->path, + }; + + status = mars_send_struct(&output->socket, &cmd, mars_cmd_meta); + if (unlikely(status < 0)) + goto done; + } + if (status >= 0) { + struct mars_cmd cmd = { + .cmd_code = CMD_GETINFO, + }; + + status = mars_send_struct(&output->socket, &cmd, mars_cmd_meta); + } + +done: + if (status < 0) { + MARS_INF("cannot connect to remote host '%s' (status = %d) -- retrying\n", output->host ? output->host : "NULL", status); + _kill_socket(output); + } + return status; +} + +////////////////// own brick / input / output operations ////////////////// + +static int client_get_info(struct client_output *output, struct mars_info *info) +{ + int status; + +#if 0 + status = _connect(output, output->brick->brick_name); + if (status < 0) + goto done; +#endif + + wait_event_interruptible_timeout(output->info_event, output->got_info, 60 * HZ); + status = -EIO; + if (output->got_info && info) { + memcpy(info, &output->info, sizeof(*info)); + status = 0; + } + +//done: + return status; +} + +static int client_ref_get(struct client_output *output, struct mref_object *mref) +{ + int maxlen; + _CHECK_ATOMIC(&mref->ref_count, !=, 0); + /* Limit transfers to page boundaries. + * Currently, this is more restrictive than necessary. + * TODO: improve performance by doing better when possible. + * This needs help from the server in some efficient way. 
+	 */
+	maxlen = PAGE_SIZE - (mref->ref_pos & (PAGE_SIZE-1));
+	if (mref->ref_len > maxlen)
+		mref->ref_len = maxlen;
+
+	if (!mref->ref_data) { // buffered IO
+		struct client_mref_aspect *mref_a = client_mref_get_aspect(output, mref);
+		if (!mref_a)
+			return -EILSEQ;
+
+		mref->ref_data = kmalloc(mref->ref_len, GFP_MARS);
+		if (!mref->ref_data)
+			return -ENOMEM;
+
+		// remember that client_ref_put() has to free the buffer
+		mref_a->do_dealloc = true;
+		mref->ref_flags = 0;
+	}
+
+	atomic_inc(&mref->ref_count);
+	return 0;
+}
+
+/* Drop one reference.  When the last one is gone, the data buffer is
+ * freed if it was allocated by client_ref_get(), and the mref itself
+ * is released.
+ */
+static void client_ref_put(struct client_output *output, struct mref_object *mref)
+{
+	struct client_mref_aspect *mref_a;
+	CHECK_ATOMIC(&mref->ref_count, 1);
+	if (!atomic_dec_and_test(&mref->ref_count))
+		return;
+	mref_a = client_mref_get_aspect(output, mref);
+	if (mref_a && mref_a->do_dealloc) {
+		kfree(mref->ref_data);
+	}
+	client_free_mref(mref);
+}
+
+/* Start IO on @mref: assign a fresh request id and queue the request
+ * for the sender thread.  The extra reference taken here is dropped by
+ * the receiver thread after it has run the callback.
+ *
+ * If no aspect can be obtained, the callback is run immediately with
+ * cb_error = -EINVAL.
+ */
+static void client_ref_io(struct client_output *output, struct mref_object *mref)
+{
+	struct generic_callback *cb;
+	struct client_mref_aspect *mref_a;
+	unsigned long flags;
+	int error = -EINVAL;
+
+	mref_a = client_mref_get_aspect(output, mref);
+	if (unlikely(!mref_a)) {
+		goto error;
+	}
+
+	atomic_inc(&mref->ref_count);
+
+	traced_lock(&output->lock, flags);
+	mref_a->object->ref_id = ++output->last_id;
+	list_add_tail(&mref_a->io_head, &output->mref_list);
+	traced_unlock(&output->lock, flags);
+
+	wake_up_interruptible(&output->event);
+
+	return;
+
+error:
+	MARS_ERR("IO error = %d\n", error);
+	cb = mref->ref_cb;
+	cb->cb_error = error;
+	cb->cb_fn(cb);
+	/* No client_ref_put() here: this path is taken before the
+	 * atomic_inc() above, so there is no IO reference to give back.
+	 * Dropping one would release the reference held by the caller.
+	 */
+}
+
+/* Receiver kthread.
+ *
+ * Reads command headers from the socket and dispatches on cmd_code
+ * until it is asked to stop, the socket disappears, or an error occurs:
+ *  - CMD_CONNECT: a negative cmd_int1 reports a failed remote connect.
+ *  - CMD_CB:      completion of the request whose id is in cmd_int1.
+ *  - CMD_GETINFO: a mars_info struct follows; waiters are woken.
+ * On exit it clears receiver.thread and shuts down the socket.
+ */
+static int receiver_thread(void *data)
+{
+	struct client_output *output = data;
+	int status = 0;
+
+	while (!kthread_should_stop() && output->socket) {
+		struct mars_cmd cmd = {};
+		struct list_head *tmp;
+		struct client_mref_aspect *mref_a = NULL;
+		struct mref_object *mref = NULL;
+		struct generic_callback *cb;
+		unsigned long flags;
+
+		status = mars_recv_struct(&output->socket, &cmd, mars_cmd_meta);
+		if (status < 0)
+			goto done;
+
+		switch (cmd.cmd_code) {
+		case CMD_CONNECT:
+			if (cmd.cmd_int1 < 0) {
+				status = cmd.cmd_int1;
+				MARS_ERR("remote connect failed, status = %d\n", status);
+				goto done;
+			}
+			break;
+		case CMD_CB:
+			// look up the request among those already sent
+			traced_lock(&output->lock, flags);
+			for (tmp = output->wait_list.next; tmp != &output->wait_list; tmp = tmp->next) {
+				mref_a = container_of(tmp, struct client_mref_aspect, io_head);
+				if (mref_a->object->ref_id == cmd.cmd_int1) {
+					mref = mref_a->object;
+					break;
+				}
+			}
+			traced_unlock(&output->lock, flags);
+
+			// mref_a is only meaningful when mref was found
+			if (!mref) {
+				MARS_ERR("unknown id = %d\n", cmd.cmd_int1);
+				status = -EBADR;
+				goto done;
+			}
+
+			/* NOTE(review): the entry stays on the list while the
+			 * lock is dropped; the sender's resubmit path may move
+			 * it meanwhile -- confirm this is intended.
+			 */
+			status = mars_recv_cb(&output->socket, mref);
+			if (status < 0) {
+				MARS_ERR("interrupted data transfer, status = %d\n", status);
+				goto done;
+			}
+
+			traced_lock(&output->lock, flags);
+			list_del_init(&mref_a->io_head);
+			traced_unlock(&output->lock, flags);
+
+			cb = mref->ref_cb;
+			cb->cb_fn(cb);
+			// drop the reference taken in client_ref_io()
+			client_ref_put(output, mref);
+			break;
+		case CMD_GETINFO:
+			status = mars_recv_struct(&output->socket, &output->info, mars_info_meta);
+			if (status < 0) {
+				MARS_ERR("got bad info, status = %d\n", status);
+				goto done;
+			}
+			output->got_info = true;
+			wake_up_interruptible(&output->info_event);
+			break;
+		default:
+			MARS_ERR("got bad command %d, terminating.\n", cmd.cmd_code);
+			status = -EBADR;
+			goto done;
+		}
+	}
+
+done:
+	if (status < 0)
+		MARS_ERR("receiver thread terminated with status = %d\n", status);
+	output->receiver.thread = NULL;
+	if (output->socket) {
+		MARS_INF("shutting down socket\n");
+		kernel_sock_shutdown(output->socket, SHUT_WR);
+		//msleep(1000);
+		output->socket = NULL;
+	}
+	return status;
+}
+
+/* Sender kthread.
+ *
+ * Keeps the connection and the receiver thread alive, and transmits
+ * queued requests one at a time.  A request is moved from mref_list
+ * to wait_list before it is sent; after a reconnect, everything on
+ * wait_list is put back in front of mref_list so that it is sent again.
+ */
+static int sender_thread(void *data)
+{
+	struct client_output *output = data;
+	struct client_brick *brick = output->brick;
+	int status = 0;
+
+	while (!kthread_should_stop()) {
+		struct list_head *tmp;
+		struct client_mref_aspect *mref_a;
+		unsigned long flags;
+		bool do_resubmit = false;
+
+		if (unlikely(!output->socket)) {
+			status = _connect(output, brick->brick_name);
+			if (unlikely(status < 0)) {
+				msleep(5000);
+				continue;
+			}
+			do_resubmit = true;
+		}
+
+		if (unlikely(!output->receiver.thread)) {
+			output->receiver.thread = kthread_create(receiver_thread, output, "mars_receiver%d", thread_count++);
+			if (unlikely(IS_ERR(output->receiver.thread))) {
+				MARS_ERR("cannot start receiver thread, status = %d\n", (int)PTR_ERR(output->receiver.thread));
+				output->receiver.thread = NULL;
+				msleep(5000);
+				continue;
+			}
+			wake_up_process(output->receiver.thread);
+		}
+
+		if (do_resubmit) {
+			/* Re-Submit any waiting requests:
+			 * put them in front of the not-yet-sent ones and
+			 * leave wait_list empty.
+			 */
+			traced_lock(&output->lock, flags);
+			list_splice_init(&output->wait_list, &output->mref_list);
+			traced_unlock(&output->lock, flags);
+		}
+
+		wait_event_interruptible_timeout(output->event, !list_empty(&output->mref_list), 1 * HZ);
+
+		/* Decide under the lock whether there is work: the wakeup
+		 * condition above was evaluated without it.
+		 */
+		traced_lock(&output->lock, flags);
+		if (list_empty(&output->mref_list)) {
+			traced_unlock(&output->lock, flags);
+			continue;
+		}
+		tmp = output->mref_list.next;
+		list_move(tmp, &output->wait_list);
+		traced_unlock(&output->lock, flags);
+
+		mref_a = container_of(tmp, struct client_mref_aspect, io_head);
+
+		status = mars_send_mref(&output->socket, mref_a->object);
+		if (unlikely(status < 0)) {
+			// retry submission on next occasion..
+			traced_lock(&output->lock, flags);
+			list_move(&mref_a->io_head, &output->mref_list);
+			traced_unlock(&output->lock, flags);
+
+			MARS_ERR("sending failed, status = %d\n", status);
+
+			_kill_socket(output);
+			_kill_thread(&output->receiver);
+
+			/* Forcibly mark as dead, in any case.
+			 * In consequence, a new connection will be tried thereafter.
+			 */
+			output->receiver.thread = NULL;
+			continue;
+		}
+	}
+//done:
+	if (status < 0)
+		MARS_ERR("sender thread terminated with status = %d\n", status);
+
+	_kill_socket(output);
+	_kill_thread(&output->receiver);
+
+	output->sender.thread = NULL;
+	return status;
+}
+
+/* Power switch of the brick.
+ *
+ * Button on:  start the sender thread (unless one is already recorded)
+ *             and turn the "on" LED on.
+ * Button off: stop the sender thread; the "off" LED reflects whether
+ *             the thread pointer has been cleared.
+ *
+ * Returns 0, or the negative errno from kthread_create().
+ */
+static int client_switch(struct client_brick *brick)
+{
+	struct client_output *output = brick->outputs[0];
+	int status = 0;
+
+	if (brick->power.button) {
+		mars_power_led_off((void*)brick, false);
+		// never start a second sender on the same output
+		if (!output->sender.thread) {
+			output->sender.thread = kthread_create(sender_thread, output, "mars_sender%d", thread_count++);
+			if (unlikely(IS_ERR(output->sender.thread))) {
+				status = PTR_ERR(output->sender.thread);
+				MARS_ERR("cannot start sender thread, status = %d\n", status);
+				output->sender.thread = NULL;
+				goto done;
+			}
+			wake_up_process(output->sender.thread);
+		}
+		mars_power_led_on((void*)brick, true);
+	} else {
+		mars_power_led_on((void*)brick, false);
+		_kill_thread(&output->sender);
+		mars_power_led_off((void*)brick, !output->sender.thread);
+	}
+done:
+	return status;
+}
+
+
+//////////////// object / aspect constructors / destructors ///////////////
+
+/* Aspect constructor: start with an unlinked io_head. */
+static int client_mref_aspect_init_fn(struct generic_aspect *_ini, void *_init_data)
+{
+	struct client_mref_aspect *ini = (void*)_ini;
+	INIT_LIST_HEAD(&ini->io_head);
+	return 0;
+}
+
+/* Aspect destructor: nothing to release. */
+static void client_mref_aspect_exit_fn(struct generic_aspect *_ini, void *_init_data)
+{
+	struct client_mref_aspect *ini = (void*)_ini;
+	(void)ini;
+}
+
+MARS_MAKE_STATICS(client);
+
+////////////////////// brick constructors / destructors ////////////////////
+
+static int client_brick_construct(struct client_brick *brick)
+{
+	return 0;
+}
+
+/* Output constructor: set up the lock, both request lists and the
+ * wait queues.
+ */
+static int client_output_construct(struct client_output *output)
+{
+	spin_lock_init(&output->lock);
+	INIT_LIST_HEAD(&output->mref_list);
+	INIT_LIST_HEAD(&output->wait_list);
+	init_waitqueue_head(&output->event);
+	init_waitqueue_head(&output->sender.event);
+	init_waitqueue_head(&output->receiver.event);
+	init_waitqueue_head(&output->info_event);
+	return 0;
+}
+
+/* Output destructor: release the duplicated "host+path" string.
+ * Both pointers are cleared because path points into the host buffer.
+ */
+static int client_output_destruct(struct client_output *output)
+{
+	kfree(output->host); // kfree(NULL) is a no-op
+	output->host = NULL;
+	output->path = NULL;
+	return 0;
+}
+
+///////////////////////// static structs ////////////////////////
+
+static struct client_brick_ops client_brick_ops = {
+	.brick_switch = client_switch,
+};
+
+static struct client_output_ops client_output_ops = {
+	.make_object_layout = client_make_object_layout,
+	.mars_get_info = client_get_info,
+	.mref_get = client_ref_get,
+	.mref_put = client_ref_put,
+	.mref_io = client_ref_io,
+};
+
+const struct client_input_type client_input_type = {
+	.type_name = "client_input",
+	.input_size = sizeof(struct client_input),
+};
+
+static const struct client_input_type *client_input_types[] = {
+	&client_input_type,
+};
+
+const struct client_output_type client_output_type = {
+	.type_name = "client_output",
+	.output_size = sizeof(struct client_output),
+	.master_ops = &client_output_ops,
+	.output_construct = &client_output_construct,
+	.output_destruct = &client_output_destruct,
+	.aspect_types = client_aspect_types,
+	.layout_code = {
+		[BRICK_OBJ_MREF] = LAYOUT_ALL,
+	}
+};
+
+static const struct client_output_type *client_output_types[] = {
+	&client_output_type,
+};
+
+// A client brick has no inputs and exactly one output.
+const struct client_brick_type client_brick_type = {
+	.type_name = "client_brick",
+	.brick_size = sizeof(struct client_brick),
+	.max_inputs = 0,
+	.max_outputs = 1,
+	.master_ops = &client_brick_ops,
+	.default_input_types = client_input_types,
+	.default_output_types = client_output_types,
+	.brick_construct = &client_brick_construct,
+};
+EXPORT_SYMBOL_GPL(client_brick_type);
+
+////////////////// module init stuff /////////////////////////
+
+static int __init init_client(void)
+{
+	MARS_INF("init_client()\n");
+	return client_register_brick_type();
+}
+
+static void __exit exit_client(void)
+{
+	MARS_INF("exit_client()\n");
+	client_unregister_brick_type();
+}
+
+MODULE_DESCRIPTION("MARS client brick");
+MODULE_AUTHOR("Thomas Schoebel-Theuer ");
+MODULE_LICENSE("GPL");
+
+module_init(init_client);
+module_exit(exit_client);
diff --git a/mars_client.h b/mars_client.h
new file mode 100644
index 00000000..347bd7e7
--- /dev/null
+++ b/mars_client.h
@@ -0,0 +1,45 @@
+// (c) 2010 Thomas Schoebel-Theuer / 1&1 Internet AG
+#ifndef MARS_CLIENT_H
+#define MARS_CLIENT_H
+
+#include "mars_net.h"
+
+struct client_mref_aspect {
+	GENERIC_ASPECT(mref);
+	struct list_head io_head; // link in mref_list or wait_list
+	bool do_dealloc; // ref_data was allocated by client_ref_get()
+};
+
+struct client_brick {
+	MARS_BRICK(client);
+};
+
+struct client_input {
+	MARS_INPUT(client);
+};
+
+struct client_threadinfo {
+	struct task_struct *thread;
+	wait_queue_head_t event;
+};
+
+struct client_output {
+	MARS_OUTPUT(client);
+	spinlock_t lock; // taken around mref_list / wait_list / last_id updates
+	struct list_head mref_list; // requests waiting to be sent
+	struct list_head wait_list; // requests sent, completion pending
+	wait_queue_head_t event; // wakes the sender thread
+	int last_id; // most recently assigned ref_id
+	struct socket *socket;
+	char *host; // copy of the brick name, cut at the '+'
+	char *path; // points into host, behind the '+'
+	struct client_threadinfo sender;
+	struct client_threadinfo receiver;
+	struct mars_info info; // filled in on CMD_GETINFO
+	wait_queue_head_t info_event;
+	bool got_info;
+};
+
+MARS_TYPES(client);
+
+#endif
diff --git a/mars_copy.c b/mars_copy.c
new file mode 100644
index 00000000..dcf3605a
--- /dev/null
+++ b/mars_copy.c
@@ -0,0 +1,572 @@
+// (c) 2010 Thomas Schoebel-Theuer / 1&1 Internet AG
+
+// Copy brick (just for demonstration)
+
+//#define BRICK_DEBUGGING
+//#define MARS_DEBUGGING
+//#define IO_DEBUGGING
+
+#include
+#include
+#include
+#include
+
+#include "mars.h"
+
+///////////////////////// own type definitions ////////////////////////
+
+#include "mars_copy.h"
+
+///////////////////////// own helper functions ////////////////////////
+
+/* TODO:
+ * The clash logic is untested / alpha stage (Feb. 2011).
+ *
+ * For now, the output is never used, so this cannot do harm.
+ *
+ * In order to get the output really working / enterprise grade,
+ * some larger test effort should be invested.
+ */
+/* Raise the clash flag (bit 0 of brick->clash) and wake the copy thread. */
+static inline
+void _clash(struct copy_brick *brick)
+{
+	brick->trigger = true;
+	set_bit(0, &brick->clash);
+	wake_up_interruptible(&brick->event);
+}
+
+/* Atomically clear the clash flag; returns its previous value. */
+static inline
+int _clear_clash(struct copy_brick *brick)
+{
+	int old;
+	old = test_and_clear_bit(0, &brick->clash);
+	return old;
+}
+
+/* Current semantics:
+ *
+ * All writes always go to the original input A. They are _not_
+ * replicated to B.
+ *
+ * In order to get B really up to date, you have to replay the right
+ * transaction logs there (at the right time).
+ * [If you had no writes on A at all during the copy, of course
+ * this is not necessary]
+ *
+ * When optimize_mode is on, reads can utilize the already copied
+ * region from B, but only as long as this region has not been
+ * invalidated by writes (indicated by low_dirty).
+ *
+ * TODO: implement replicated writes, together with some transaction
+ * replay logic applying the transaction logs _only_ after
+ * crashes during inconsistency caused by partial replication of writes.
+ */
+/* Select the input that serves a request arriving at the output.
+ *
+ * Returns INPUT_B_IO only for pure reads which end at or below
+ * copy_start while optimize_mode is on and low_dirty is not set;
+ * INPUT_A_IO otherwise.  As a side effect, a write starting below
+ * copy_end sets low_dirty, and additionally raises a clash when it
+ * does not end at or below copy_start.
+ */
+static
+int _determine_input(struct copy_brick *brick, struct mref_object *mref)
+{
+	int rw;
+	int below;
+	int behind;
+	loff_t ref_end;
+
+	if (!brick->optimize_mode || brick->low_dirty)
+		return INPUT_A_IO;
+
+	ref_end = mref->ref_pos + mref->ref_len;
+	below = ref_end <= brick->copy_start;
+	behind = !brick->copy_end || mref->ref_pos >= brick->copy_end;
+	rw = mref->ref_may_write | mref->ref_rw;
+	if (rw) {
+		if (!behind) {
+			brick->low_dirty = true;
+			if (!below) {
+				// _clash() already wakes up brick->event
+				_clash(brick);
+			}
+		}
+		return INPUT_A_IO;
+	}
+
+	if (below)
+		return INPUT_B_IO;
+
+	return INPUT_A_IO;
+}
+
+// Slot in state[] / table[] for the page containing pos.
+#define MAKE_INDEX(pos) (((pos) / PAGE_SIZE) % MAX_COPY_PARA)
+
+/* Completion callback for the copy requests created by _make_mref().
+ *
+ * On success the finished mref is parked in table[index][queue], but
+ * only while state[index] is positive.  On IO error the error code is
+ * stored in state[index] instead.  In all handled cases copy_flight is
+ * decremented and the copy thread is woken.
+ */
+static
+void copy_endio(struct generic_callback *cb)
+{
+	struct copy_mref_aspect *mref_a;
+	struct mref_object *mref;
+	struct copy_brick *brick;
+	int index;
+	int queue;
+
+	mref_a = cb->cb_private;
+	CHECK_PTR(mref_a, err);
+	mref = mref_a->object;
+	CHECK_PTR(mref, err);
+	brick = mref_a->brick;
+	CHECK_PTR(brick, err);
+
+	queue = mref_a->queue;
+	index = MAKE_INDEX(mref->ref_pos);
+	MARS_IO("queue = %d index = %d pos = %lld status = %d\n", queue, index, mref->ref_pos, cb->cb_error);
+	if (unlikely(queue < 0 || queue >= 2)) {
+		MARS_ERR("bad queue %d\n", queue);
+		_clash(brick);
+		goto exit;
+	}
+	if (unlikely(brick->table[index][queue])) {
+		// report the mref occupying the slot, not the address of the table row
+		MARS_ERR("table corruption at %d %d (%p => %p)\n", index, queue, brick->table[index][queue], mref);
+		_clash(brick);
+		brick->state[index] = -EINVAL;
+		goto exit;
+	}
+	if (unlikely(cb->cb_error < 0)) {
+		MARS_ERR("IO error %d on index %d, old state =%d\n", cb->cb_error, index, brick->state[index]);
+		brick->state[index] = cb->cb_error;
+	} else if (likely(brick->state[index] > 0)) {
+		brick->table[index][queue] = mref;
+	}
+
+exit:
+	atomic_dec(&brick->copy_flight);
+	brick->trigger = true;
+	wake_up_interruptible(&brick->event);
+	return;
+
+err:
+	MARS_FAT("cannot handle callback\n");
+}
+
+/* Create one copy request and start IO on it.
+ *
+ * @queue selects the input: 0 = INPUT_A_COPY, otherwise INPUT_B_COPY.
+ * @data  buffer to use, or NULL.
+ * @pos   start position; the length is clipped to the end of the page
+ *        and to copy_end.
+ * @rw    stored into both ref_rw and ref_may_write.
+ *
+ * Returns the result of mref_get (>= 0) when the request was started.
+ * Returns -1 when a clash is pending or copy_end is 0, -ENOMEM when
+ * the mref or its aspect cannot be obtained, or the negative status
+ * from mref_get.
+ */
+static
+int _make_mref(struct copy_brick *brick, int index, int queue, void *data, loff_t pos, int rw)
+{
+	struct mref_object *mref;
+	struct copy_mref_aspect *mref_a;
+	struct copy_input *input;
+	loff_t tmp_pos;
+	int len;
+	int status = -1;
+
+	tmp_pos = brick->copy_end;
+	if (brick->clash || !tmp_pos)
+		goto done;
+
+	mref = copy_alloc_mref(brick->outputs[0], &brick->mref_object_layout);
+	status = -ENOMEM;
+	if (unlikely(!mref))
+		goto done;
+
+	mref_a = copy_mref_get_aspect(brick->outputs[0], mref);
+	if (unlikely(!mref_a)) {
+		// same release routine as on the mref_get failure path below
+		mars_free_mref(mref);
+		goto done;
+	}
+
+	mref_a->brick = brick;
+	mref_a->queue = queue;
+	mref->ref_may_write = rw;
+	mref->ref_rw = rw;
+	mref->ref_data = data;
+	mref->ref_pos = pos;
+	len = PAGE_SIZE - (pos & (PAGE_SIZE-1));
+	if (pos + len > tmp_pos) {
+		len = tmp_pos - pos;
+	}
+	mref->ref_len = len;
+	mref->_ref_cb.cb_private = mref_a;
+	mref->_ref_cb.cb_fn = copy_endio;
+	mref->ref_cb = &mref->_ref_cb;
+
+	input = queue ? brick->inputs[INPUT_B_COPY] : brick->inputs[INPUT_A_COPY];
+	status = GENERIC_INPUT_CALL(input, mref_get, mref);
+	if (unlikely(status < 0)) {
+		MARS_ERR("status = %d\n", status);
+		mars_free_mref(mref);
+		goto done;
+	}
+
+	MARS_IO("queue = %d index = %d pos = %lld len = %d rw = %d\n", queue, index, mref->ref_pos, mref->ref_len, rw);
+
+	// counted down again in copy_endio()
+	atomic_inc(&brick->copy_flight);
+	GENERIC_INPUT_CALL(input, mref_io, mref);
+
+done:
+	return status;
+}
+
+/* Give back the mref parked in table[index][queue] (if any) to the
+ * input it was obtained from, and empty the slot.
+ */
+static
+void _clear_mref(struct copy_brick *brick, int index, int queue)
+{
+	struct mref_object *mref = brick->table[index][queue];
+	if (mref) {
+		struct copy_input *input;
+		input = queue ? brick->inputs[INPUT_B_COPY] : brick->inputs[INPUT_A_COPY];
+		GENERIC_INPUT_CALL(input, mref_put, mref);
+		brick->table[index][queue] = NULL;
+	}
+}
+
+/* Advance the state machine of the slot belonging to @pos by one step.
+ *
+ * START   -> issue the read on A (and also on B in verify_mode)
+ * READ2   -> wait for the read from B, then continue as READ1
+ * READ1   -> wait for the read from A; in verify_mode skip the write
+ *            when both buffers are equal, otherwise start the write to B
+ * WRITE   -> once the slot at copy_start is done, advance copy_start
+ * CLEANUP -> release both mrefs, back to START
+ *
+ * Returns 0, or a negative errno; on error the slot state becomes -1
+ * and a clash is raised.
+ */
+static
+int _next_state(struct copy_brick *brick, loff_t pos)
+{
+	struct mref_object *mref1;
+	struct mref_object *mref2;
+	int index = MAKE_INDEX(pos);
+	char state;
+	char next_state;
+	int i;
+	int status;
+
+	state = brick->state[index];
+	next_state = -1;
+	mref2 = NULL;
+	status = 0;
+
+	MARS_IO("index = %d state = %d pos = %lld\n", index, state, pos);
+
+	switch (state) {
+	case COPY_STATE_START:
+		if (brick->table[index][0] || brick->table[index][1]) {
+			MARS_ERR("index %d not startable\n", index);
+			status = -EPROTO;
+			goto done;
+		}
+		i = 0;
+		next_state = COPY_STATE_READ1;
+		if (brick->verify_mode) {
+			i = 1;
+			next_state = COPY_STATE_READ2;
+		}
+		for ( ; i >= 0; i--) {
+			status = _make_mref(brick, index, i, NULL, pos, 0);
+			if (status < 0) {
+				break;
+			}
+		}
+		break;
+	case COPY_STATE_READ2:
+		mref2 = brick->table[index][1];
+		if (!mref2) {
+			goto done;
+		}
+		/* fallthrough */
+	case COPY_STATE_READ1:
+		mref1 = brick->table[index][0];
+		if (!mref1) {
+			goto done;
+		}
+		if (mref2) {
+			int len = mref1->ref_len;
+			if (len == mref2->ref_len &&
+			    !memcmp(mref1->ref_data, mref2->ref_data, len)) {
+				/* skip start of writing, goto final treatment of writeout */
+				next_state = COPY_STATE_WRITE;
+				brick->state[index] = next_state;
+				goto COPY_STATE_WRITE;
+			}
+			_clear_mref(brick, index, 1);
+		}
+		/* start writeout */
+		next_state = COPY_STATE_WRITE;
+		status = _make_mref(brick, index, 1, mref1->ref_data, pos, 1);
+
+		break;
+	case COPY_STATE_WRITE:
+	COPY_STATE_WRITE:
+		mref2 = brick->table[index][1];
+		if (!mref2 || brick->copy_start != pos) {
+			MARS_IO("irrelevant\n");
+			goto done;
+		}
+		if (!brick->clash) {
+			brick->copy_start += mref2->ref_len;
+			MARS_IO("new copy_start = %lld\n", brick->copy_start);
+			if (brick->copy_start > brick->copy_last + 1024 * 1024 * 1024 || brick->copy_start == brick->copy_end) {
+				brick->copy_last = brick->copy_start;
+				MARS_INF("'%s' copied %lld / %lld bytes (%lld%%)\n", brick->brick_name, brick->copy_last, brick->copy_end, brick->copy_end? brick->copy_last * 100 / brick->copy_end : 100);
+			}
+		}
+		next_state = COPY_STATE_CLEANUP;
+		/* fallthrough */
+	case COPY_STATE_CLEANUP:
+		_clear_mref(brick, index, 0);
+		_clear_mref(brick, index, 1);
+		next_state = COPY_STATE_START;
+		break;
+	default:
+		MARS_ERR("illegal state %d at index %d\n", state, index);
+		_clash(brick);
+		status = -EILSEQ;
+	}
+
+	brick->state[index] = next_state;
+	if (status < 0) {
+		brick->state[index] = -1;
+		MARS_ERR("status = %d\n", status);
+		_clash(brick);
+	}
+
+done:
+	return status;
+}
+
+/* One pass of the copy thread.
+ *
+ * A pending clash is handled first: while copy IO is still in flight
+ * the clash is re-raised and the pass ends after a short sleep;
+ * otherwise all slots are released and reset to COPY_STATE_START.
+ * After that, the slots from copy_start up to copy_end are stepped,
+ * limited by MAX_COPY_PARA minus twice the ordinary IO in flight.
+ */
+static
+void _run_copy(struct copy_brick *brick)
+{
+	int max;
+	loff_t pos;
+	int i;
+
+	if (_clear_clash(brick)) {
+		MARS_DBG("clash\n");
+		if (atomic_read(&brick->copy_flight)) {
+			/* wait until all pending copy IO has finished
+			 */
+			_clash(brick);
+			MARS_DBG("re-clash\n");
+			msleep(50);
+			return;
+		}
+		for (i = 0; i < MAX_COPY_PARA; i++) {
+			/* Release parked mrefs instead of merely
+			 * forgetting them, as COPY_STATE_CLEANUP does;
+			 * otherwise their references are never put.
+			 */
+			_clear_mref(brick, i, 0);
+			_clear_mref(brick, i, 1);
+			brick->state[i] = COPY_STATE_START;
+		}
+	}
+
+	max = MAX_COPY_PARA - atomic_read(&brick->io_flight) * 2;
+	MARS_IO("max = %d\n", max);
+
+	for (pos = brick->copy_start; pos < brick->copy_end; pos = ((pos / PAGE_SIZE) + 1) * PAGE_SIZE) {
+		//MARS_IO("pos = %lld\n", pos);
+		if (brick->clash || max-- <= 0)
+			break;
+		// failures raise the clash flag, which ends this loop
+		_next_state(brick, pos);
+	}
+}
+
+/* Body of the copy kthread: run a copy pass whenever copy_end is
+ * positive, then sleep until triggered, until copy_start / copy_end
+ * change, or for at most 20 * HZ jiffies.  After the stop request it
+ * waits up to 300 * HZ jiffies for copy IO in flight.
+ */
+static int _copy_thread(void *data)
+{
+	struct copy_brick *brick = data;
+
+	MARS_DBG("--------------- copy_thread %p starting\n", brick);
+	mars_power_led_on((void*)brick, true);
+	brick->trigger = true;
+
+	while (!kthread_should_stop()) {
+		loff_t old_start = brick->copy_start;
+		loff_t old_end = brick->copy_end;
+		if (old_end > 0)
+			_run_copy(brick);
+
+		wait_event_interruptible_timeout(brick->event,
+						 brick->trigger || brick->copy_start != old_start || brick->copy_end != old_end || kthread_should_stop(),
+						 20 * HZ);
+		brick->trigger = false;
+	}
+
+	MARS_DBG("--------------- copy_thread terminating\n");
+	wait_event_interruptible_timeout(brick->event, !atomic_read(&brick->copy_flight), 300 * HZ);
+	mars_power_led_off((void*)brick, true);
+	MARS_DBG("--------------- copy_thread done.\n");
+	return 0;
+}
+
+////////////////// own brick / input / output operations //////////////////
+
+/* Forward the info request to input INPUT_B_IO. */
+static int copy_get_info(struct copy_output *output, struct mars_info *info)
+{
+	struct copy_input *input = output->brick->inputs[INPUT_B_IO];
+	return GENERIC_INPUT_CALL(input, mars_get_info, info);
+}
+
+/* Forward mref_get to the input chosen by _determine_input() and
+ * count the request in io_flight on success.
+ */
+static int copy_ref_get(struct copy_output *output, struct mref_object *mref)
+{
+	struct copy_input *input;
+	int index;
+	int status;
+	index = _determine_input(output->brick, mref);
+	input = output->brick->inputs[index];
+	status = GENERIC_INPUT_CALL(input, mref_get, mref);
+	if (status >= 0) {
+		atomic_inc(&output->brick->io_flight);
+	}
+	return status;
+}
+
+/* Forward mref_put; when io_flight drops to zero, wake the copy thread.
+ * NOTE(review): the input is determined anew here and in copy_ref_io(),
+ * so it may differ from the one chosen in copy_ref_get() if the brick
+ * state changed in between -- confirm.
+ */
+static void copy_ref_put(struct copy_output *output, struct mref_object *mref)
+{
+	struct copy_input *input;
+	int index;
+	index = _determine_input(output->brick, mref);
+	input = output->brick->inputs[index];
+	GENERIC_INPUT_CALL(input, mref_put, mref);
+	if (atomic_dec_and_test(&output->brick->io_flight)) {
+		output->brick->trigger = true;
+		wake_up_interruptible(&output->brick->event);
+	}
+}
+
+/* Forward mref_io to the input chosen by _determine_input(). */
+static void copy_ref_io(struct copy_output *output, struct mref_object *mref)
+{
+	struct copy_input *input;
+	int index;
+	index = _determine_input(output->brick, mref);
+	input = output->brick->inputs[index];
+	GENERIC_INPUT_CALL(input, mref_io, mref);
+}
+
+/* Power switch of the brick.
+ *
+ * Button on:  create and start the copy thread unless one exists.
+ *             The "on" LED is set by the thread itself.
+ * Button off: request the thread to stop (without waiting) and drop
+ *             the task reference taken at start.
+ *
+ * Always returns 0.
+ */
+static int copy_switch(struct copy_brick *brick)
+{
+	static int version = 0;
+
+	MARS_DBG("power.button = %d\n", brick->power.button);
+	if (brick->power.button) {
+		mars_power_led_off((void*)brick, false);
+		if (!brick->thread) {
+			struct task_struct *thread;
+
+			/* kthread_create() reports failure as ERR_PTR(),
+			 * never as NULL.  Only a valid task pointer may
+			 * be stored in brick->thread.
+			 */
+			thread = kthread_create(_copy_thread, brick, "mars_copy%d", version++);
+			if (!IS_ERR(thread)) {
+				brick->thread = thread;
+				get_task_struct(brick->thread);
+				brick->trigger = true;
+				wake_up_process(brick->thread);
+			} else {
+				mars_power_led_off((void*)brick, true);
+				MARS_ERR("could not start copy thread\n");
+			}
+		}
+	} else {
+		mars_power_led_on((void*)brick, false);
+		if (brick->thread) {
+			kthread_stop_nowait(brick->thread);
+			put_task_struct(brick->thread);
+			brick->thread = NULL;
+			wake_up_interruptible(&brick->event);
+		}
+	}
+	return 0;
+}
+
+
+//////////////// object / aspect constructors / destructors ///////////////
+
+static int copy_mref_aspect_init_fn(struct generic_aspect *_ini, void *_init_data)
+{
+	struct copy_mref_aspect *ini = (void*)_ini;
+	(void)ini;
+	return 0;
+}
+
+static void copy_mref_aspect_exit_fn(struct generic_aspect *_ini, void *_init_data)
+{
+	struct copy_mref_aspect *ini = (void*)_ini;
+	(void)ini;
+}
+
+MARS_MAKE_STATICS(copy);
+
+////////////////////// brick constructors / destructors ////////////////////
+
+static int copy_brick_construct(struct copy_brick *brick)
+{
+	init_waitqueue_head(&brick->event);
+	sema_init(&brick->mutex, 1);
+	return 0;
+}
+
+static int copy_brick_destruct(struct copy_brick *brick)
+{
+	return 0;
+}
+
+static int copy_output_construct(struct copy_output *output)
+{
+	return 0;
+}
+
+static int copy_output_destruct(struct copy_output *output)
+{
+	return 0;
+}
+
+///////////////////////// static structs ////////////////////////
+
+static struct copy_brick_ops copy_brick_ops = {
+	.brick_switch = copy_switch,
+};
+
+static struct copy_output_ops copy_output_ops = {
+	.make_object_layout = copy_make_object_layout,
+	.mars_get_info = copy_get_info,
+	.mref_get = copy_ref_get,
+	.mref_put = copy_ref_put,
+	.mref_io = copy_ref_io,
+};
+
+const struct copy_input_type copy_input_type = {
+	.type_name = "copy_input",
+	.input_size = sizeof(struct copy_input),
+};
+
+// one entry per input: INPUT_A_IO, INPUT_A_COPY, INPUT_B_IO, INPUT_B_COPY
+static const struct copy_input_type *copy_input_types[] = {
+	&copy_input_type,
+	&copy_input_type,
+	&copy_input_type,
+	&copy_input_type,
+};
+
+const struct copy_output_type copy_output_type = {
+	.type_name = "copy_output",
+	.output_size = sizeof(struct copy_output),
+	.master_ops = &copy_output_ops,
+	.output_construct = &copy_output_construct,
+	.output_destruct = &copy_output_destruct,
+	.aspect_types = copy_aspect_types,
+	.layout_code = {
+		[BRICK_OBJ_MREF] = LAYOUT_ALL,
+	}
+};
+
+static const struct copy_output_type *copy_output_types[] = {
+	&copy_output_type,
+};
+
+const struct copy_brick_type copy_brick_type = {
+	.type_name = "copy_brick",
+	.brick_size = sizeof(struct copy_brick),
+	.max_inputs = 4,
+	.max_outputs = 1,
+	.master_ops = &copy_brick_ops,
+	.default_input_types = copy_input_types,
+	.default_output_types = copy_output_types,
+	.brick_construct = &copy_brick_construct,
+	.brick_destruct = &copy_brick_destruct,
+};
+EXPORT_SYMBOL_GPL(copy_brick_type);
+
+////////////////// module init stuff /////////////////////////
+
+static int __init init_copy(void)
+{
+	MARS_INF("init_copy()\n");
+	return copy_register_brick_type();
+}
+
+static void __exit exit_copy(void)
+{
+	MARS_INF("exit_copy()\n");
+	copy_unregister_brick_type();
+}
+
+MODULE_DESCRIPTION("MARS copy brick");
+MODULE_AUTHOR("Thomas Schoebel-Theuer ");
+MODULE_LICENSE("GPL");
+
+module_init(init_copy);
+module_exit(exit_copy);
diff --git a/mars_copy.h b/mars_copy.h
new file mode 100644
index 00000000..7119f654
--- /dev/null
+++ b/mars_copy.h
@@ -0,0 +1,62 @@
+// (c) 2010 Thomas Schoebel-Theuer / 1&1 Internet AG
+#ifndef MARS_COPY_H
+#define MARS_COPY_H
+
+#include
+#include
+
+#define INPUT_A_IO 0
+#define INPUT_A_COPY 1
+#define INPUT_B_IO 2
+#define INPUT_B_COPY 3
+
+#define MAX_COPY_PARA 512
+
+enum {
+	COPY_STATE_START = 0,
+	COPY_STATE_READ1 = 1,
+	COPY_STATE_READ2 = 2,
+	COPY_STATE_WRITE,
+	COPY_STATE_CLEANUP,
+};
+
+struct copy_mref_aspect {
+	GENERIC_ASPECT(mref);
+	struct copy_brick *brick;
+	int queue; // 0 = input A, 1 = input B
+};
+
+struct copy_brick {
+	MARS_BRICK(copy);
+	// parameters
+	volatile loff_t copy_start;
+	volatile loff_t copy_end; // stop working if == 0
+	loff_t copy_last; // copy_start at the last progress message
+	bool verify_mode;
+	bool optimize_mode;
+	bool low_dirty;
+	// internal
+	volatile bool trigger;
+	volatile unsigned long clash;
+	atomic_t io_flight;
+	atomic_t copy_flight;
+
+	wait_queue_head_t event;
+	struct semaphore mutex;
+	struct task_struct *thread;
+	signed char state[MAX_COPY_PARA]; // COPY_STATE_* or negative errno; must be signed even where plain char is not
+	struct mref_object *table[MAX_COPY_PARA][2];
+	struct generic_object_layout mref_object_layout;
+};
+
+struct copy_input {
+	MARS_INPUT(copy);
+};
+
+struct copy_output {
+	MARS_OUTPUT(copy);
+};
+
+MARS_TYPES(copy);
+
+#endif
diff --git a/mars_device_aio.c b/mars_device_aio.c index 4849e3ea..614797f7 100644 --- a/mars_device_aio.c +++ b/mars_device_aio.c @@ -2,7 +2,7 @@ //#define BRICK_DEBUGGING //#define MARS_DEBUGGING -//#define LOG +//#define IO_DEBUGGING #include #include @@ -44,7 +44,7 @@ static int device_aio_ref_get(struct device_aio_output *output, struct mref_obje return -ENOMEM; mref->ref_flags = 0; mref_a->do_dealloc = true; -#if 1 // litter flags for testing +#if 0 // litter flags for testing if (mref->ref_rw) { static int random = 0; if (!(random++ % 2)) @@ -84,16 +84,14 @@ static void device_aio_ref_io(struct device_aio_output *output, struct mref_obje goto done; } -#ifdef LOG - MARS_INF("AIO rw=%d pos=%lld len=%d data=%p\n", mref->ref_rw, mref->ref_pos, mref->ref_len, mref->ref_data); -#endif + MARS_IO("AIO rw=%d pos=%lld len=%d data=%p\n", mref->ref_rw, mref->ref_pos, mref->ref_len, mref->ref_data); mref_a = device_aio_mref_get_aspect(output, mref); traced_lock(&tinfo->lock, flags); list_add_tail(&mref_a->io_head, &tinfo->mref_list); traced_unlock(&tinfo->lock, flags); - wake_up(&tinfo->event); + wake_up_interruptible(&tinfo->event); return; done: @@ -129,6 +127,22 @@ static int device_aio_submit(struct device_aio_output *output, struct device_aio return res; } +static int device_aio_submit_dummy(struct device_aio_output *output) +{ + mm_segment_t oldfs; + int res; + struct iocb iocb = { + }; + struct iocb *iocbp = &iocb; + + oldfs = get_fs(); + set_fs(get_ds()); + res = sys_io_submit(output->ctxp, 1, 
&iocbp); + set_fs(oldfs); + + return res; +} + static int device_aio_submit_thread(void *data) { struct aio_threadinfo *tinfo = data; @@ -137,7 +151,7 @@ static int device_aio_submit_thread(void *data) /* TODO: this is provisionary. We only need it for sys_io_submit(). * The latter should be accompanied by a future vfs_submit() or - * do_sumbmit() which currently does not exist :( + * do_submit() which currently does not exist :( * FIXME: corresponding cleanup NYI */ err = get_unused_fd(); @@ -150,11 +164,15 @@ static int device_aio_submit_thread(void *data) MARS_INF("kthread has started.\n"); //set_user_nice(current, -20); +#if 0 + fake_mm(); +#else MARS_INF("old mm = %p\n", current->mm); use_mm(tinfo->mm); MARS_INF("new mm = %p\n", current->mm); if (!current->mm) return 0; +#endif while (!kthread_should_stop()) { struct list_head *tmp = NULL; @@ -205,6 +223,7 @@ static int device_aio_submit_thread(void *data) unuse_mm(tinfo->mm); MARS_INF("kthread has stopped.\n"); + tinfo->terminated = true; return 0; } @@ -229,12 +248,15 @@ static int device_aio_event_thread(void *data) int bounced; int i; struct timespec timeout = { - .tv_sec = 30, + .tv_sec = 10, }; struct io_event events[MARS_MAX_AIO_READ]; oldfs = get_fs(); set_fs(get_ds()); + /* TODO: don't timeout upon termination. + * Probably we should submit a dummy request. 
+ */ count = sys_io_getevents(output->ctxp, 1, MARS_MAX_AIO_READ, events, &timeout); set_fs(oldfs); @@ -242,13 +264,17 @@ static int device_aio_event_thread(void *data) bounced = 0; for (i = 0; i < count; i++) { struct device_aio_mref_aspect *mref_a = (void*)events[i].data; - struct mref_object *mref = mref_a->object; - struct generic_callback *cb = mref->ref_cb; + struct mref_object *mref; + struct generic_callback *cb; int err = events[i].res; -#ifdef LOG - MARS_INF("AIO done %p pos = %lld len = %d rw = %d\n", mref, mref->ref_pos, mref->ref_len, mref->ref_rw); -#endif + if (!mref_a) { + continue; // this was a dummy request + } + mref = mref_a->object; + cb = mref->ref_cb; + + MARS_IO("AIO done %p pos = %lld len = %d rw = %d\n", mref, mref->ref_pos, mref->ref_len, mref->ref_rw); if (output->o_fdsync && err >= 0 @@ -277,12 +303,13 @@ static int device_aio_event_thread(void *data) device_aio_ref_put(output, mref); } if (bounced) - wake_up(&other->event); + wake_up_interruptible(&other->event); } unuse_mm(tinfo->mm); MARS_INF("kthread has stopped.\n"); + tinfo->terminated = true; return 0; } @@ -294,7 +321,7 @@ static int device_aio_sync_thread(void *data) struct device_aio_output *output = tinfo->output; struct file *file = output->filp; - MARS_INF("kthread has started.\n"); + MARS_INF("kthread has started on '%s'.\n", output->brick->brick_name); //set_user_nice(current, -20); while (!kthread_should_stop()) { @@ -339,13 +366,18 @@ static int device_aio_sync_thread(void *data) } MARS_INF("kthread has stopped.\n"); + tinfo->terminated = true; return 0; } static int device_aio_get_info(struct device_aio_output *output, struct mars_info *info) { struct file *file = output->filp; + if (unlikely(!file || !file->f_mapping || !file->f_mapping->host)) + return -EINVAL; + info->current_size = i_size_read(file->f_mapping->host); + MARS_DBG("determined file size = %lld\n", info->current_size); info->backing_file = file; return 0; } @@ -374,23 +406,27 @@ static int 
device_aio_brick_construct(struct device_aio_brick *brick) return 0; } -static int device_aio_switch(struct device_aio_brick *brick, bool state) +static int device_aio_switch(struct device_aio_brick *brick) { static int index = 0; struct device_aio_output *output = brick->outputs[0]; - char *path = output->output_name; + const char *path = output->output_name; int flags = O_CREAT | O_RDWR | O_LARGEFILE; int prot = 0600; mm_segment_t oldfs; int i; int err = 0; + MARS_DBG("power.button = %d\n", brick->power.button); + if (!brick->power.button) + goto cleanup; + + mars_power_led_off((void*)brick, false); + if (output->o_direct) { flags |= O_DIRECT; MARS_INF("using O_DIRECT on %s\n", path); } - if (!state) - goto cleanup; oldfs = get_fs(); set_fs(get_ds()); @@ -403,9 +439,14 @@ static int device_aio_switch(struct device_aio_brick *brick, bool state) output->filp = NULL; return err; } + MARS_DBG("opened file '%s'\n", path); if (!output->ctxp) { - MARS_INF("mm = %p\n", current->mm); + if (!current->mm) { + MARS_ERR("mm = %p\n", current->mm); + err = -EINVAL; + goto err; + } oldfs = get_fs(); set_fs(get_ds()); err = sys_io_setup(MARS_MAX_AIO, &output->ctxp); @@ -426,6 +467,7 @@ static int device_aio_switch(struct device_aio_brick *brick, bool state) tinfo->mm = current->mm; spin_lock_init(&tinfo->lock); init_waitqueue_head(&tinfo->event); + tinfo->terminated = false; tinfo->thread = kthread_create(fn[i], tinfo, "mars_aio%d", index++); if (IS_ERR(tinfo->thread)) { err = PTR_ERR(tinfo->thread); @@ -437,26 +479,50 @@ static int device_aio_switch(struct device_aio_brick *brick, bool state) } MARS_INF("opened file '%s'\n", path); + mars_power_led_on((void*)brick, true); + MARS_DBG("successfully switched on.\n"); return 0; err: MARS_ERR("status = %d\n", err); cleanup: - for (i = 0; i < 2; i++) { + mars_power_led_on((void*)brick, false); + for (i = 0; i < 3; i++) { struct aio_threadinfo *tinfo = &output->tinfo[i]; if (tinfo->thread) { kthread_stop(tinfo->thread); - // FIXME: 
wait for termination tinfo->thread = NULL; } } - if (output->ctxp) { - //... + device_aio_submit_dummy(output); + for (i = 0; i < 3; i++) { + struct aio_threadinfo *tinfo = &output->tinfo[i]; + if (tinfo->thread) { + // wait for termination + wait_event_interruptible_timeout( + tinfo->event, + tinfo->terminated, 30 * HZ); + if (tinfo->terminated) + tinfo->thread = NULL; + } } - if (output->filp) { - filp_close(output->filp, NULL); - output->filp = NULL; + mars_power_led_off((void*)brick, + (output->tinfo[0].thread == NULL && + output->tinfo[1].thread == NULL && + output->tinfo[2].thread == NULL)); + if (brick->power.led_off) { + if (output->filp) { + filp_close(output->filp, NULL); + output->filp = NULL; + } + if (output->ctxp) { +#if 0 // FIXME this crashes + sys_io_destroy(output->ctxp); +#endif + output->ctxp = 0; + } } + MARS_DBG("switch off status = %d\n", err); return err; } @@ -467,7 +533,8 @@ static int device_aio_output_construct(struct device_aio_output *output) static int device_aio_output_destruct(struct device_aio_output *output) { - return device_aio_switch(output->brick, false); + mars_power_button((void*)output->brick, false); + return device_aio_switch(output->brick); } ///////////////////////// static structs //////////////////////// diff --git a/mars_device_aio.h b/mars_device_aio.h index 9bdde63d..0c827273 100644 --- a/mars_device_aio.h +++ b/mars_device_aio.h @@ -27,6 +27,7 @@ struct aio_threadinfo { struct mm_struct *mm; wait_queue_head_t event; spinlock_t lock; + bool terminated; }; struct device_aio_output { diff --git a/mars_device_sio.c b/mars_device_sio.c index 2cc388f7..90535b12 100644 --- a/mars_device_sio.c +++ b/mars_device_sio.c @@ -282,8 +282,6 @@ static void device_sio_mars_queue(struct device_sio_output *output, struct mref_ struct generic_callback *cb = mref->ref_cb; unsigned long flags; - atomic_inc(&mref->ref_count); - if (mref->ref_rw == READ) { traced_lock(&output->g_lock, flags); index = output->index++; @@ -297,6 +295,9 @@ 
static void device_sio_mars_queue(struct device_sio_output *output, struct mref_ cb->cb_fn(cb); return; } + + atomic_inc(&mref->ref_count); + tinfo = &output->tinfo[index]; MARS_DBG("queueing %p on %d\n", mref, index); @@ -304,7 +305,7 @@ static void device_sio_mars_queue(struct device_sio_output *output, struct mref_ list_add_tail(&mref_a->io_head, &tinfo->mref_list); traced_unlock(&tinfo->lock, flags); - wake_up(&tinfo->event); + wake_up_interruptible(&tinfo->event); } static int device_sio_thread(void *data) @@ -407,10 +408,10 @@ static int device_sio_brick_construct(struct device_sio_brick *brick) return 0; } -static int device_sio_switch(struct device_sio_brick *brick, bool state) +static int device_sio_switch(struct device_sio_brick *brick) { struct device_sio_output *output = brick->outputs[0]; - char *path = output->output_name; + const char *path = output->output_name; int flags = O_CREAT | O_RDWR | O_LARGEFILE; int prot = 0600; mm_segment_t oldfs; @@ -419,7 +420,8 @@ static int device_sio_switch(struct device_sio_brick *brick, bool state) flags |= O_DIRECT; MARS_INF("using O_DIRECT on %s\n", path); } - if (state) { + if (brick->power.button) { + mars_power_led_off((void*)brick, false); oldfs = get_fs(); set_fs(get_ds()); output->filp = filp_open(path, flags, prot); @@ -439,8 +441,11 @@ static int device_sio_switch(struct device_sio_brick *brick, bool state) } #endif MARS_INF("opened file '%s'\n", path); + mars_power_led_on((void*)brick, true); } else { + mars_power_led_on((void*)brick, false); // TODO: close etc... + mars_power_led_off((void*)brick, true); } return 0; } diff --git a/mars_dummy.c b/mars_dummy.c index 716fc6d8..85a79956 100644 --- a/mars_dummy.c +++ b/mars_dummy.c @@ -43,13 +43,28 @@ static void dummy_ref_io(struct dummy_output *output, struct mref_object *mref) GENERIC_INPUT_CALL(input, mref_io, mref); } +static int dummy_switch(struct dummy_brick *brick) +{ + if (brick->power.button) { + mars_power_led_off((void*)brick, false); + //... 
+ mars_power_led_on((void*)brick, true); + } else { + mars_power_led_on((void*)brick, false); + //... + mars_power_led_off((void*)brick, true); + } + return 0; +} + + //////////////// object / aspect constructors / destructors /////////////// static int dummy_mref_aspect_init_fn(struct generic_aspect *_ini, void *_init_data) { struct dummy_mref_aspect *ini = (void*)_ini; (void)ini; - ini->my_own = 0; + //ini->my_own = 0; return 0; } @@ -65,19 +80,30 @@ MARS_MAKE_STATICS(dummy); static int dummy_brick_construct(struct dummy_brick *brick) { - brick->my_own = 0; + //brick->my_own = 0; + return 0; +} + +static int dummy_brick_destruct(struct dummy_brick *brick) +{ return 0; } static int dummy_output_construct(struct dummy_output *output) { - output->my_own = 0; + //output->my_own = 0; + return 0; +} + +static int dummy_output_destruct(struct dummy_output *output) +{ return 0; } ///////////////////////// static structs //////////////////////// static struct dummy_brick_ops dummy_brick_ops = { + .brick_switch = dummy_switch, }; static struct dummy_output_ops dummy_output_ops = { @@ -102,6 +128,7 @@ const struct dummy_output_type dummy_output_type = { .output_size = sizeof(struct dummy_output), .master_ops = &dummy_output_ops, .output_construct = &dummy_output_construct, + .output_destruct = &dummy_output_destruct, .aspect_types = dummy_aspect_types, .layout_code = { [BRICK_OBJ_MREF] = LAYOUT_ALL, @@ -121,6 +148,7 @@ const struct dummy_brick_type dummy_brick_type = { .default_input_types = dummy_input_types, .default_output_types = dummy_output_types, .brick_construct = &dummy_brick_construct, + .brick_destruct = &dummy_brick_destruct, }; EXPORT_SYMBOL_GPL(dummy_brick_type); @@ -128,13 +156,13 @@ EXPORT_SYMBOL_GPL(dummy_brick_type); static int __init init_dummy(void) { - printk(MARS_INFO "init_dummy()\n"); + MARS_INF("init_dummy()\n"); return dummy_register_brick_type(); } static void __exit exit_dummy(void) { - printk(MARS_INFO "exit_dummy()\n"); + 
MARS_INF("exit_dummy()\n"); dummy_unregister_brick_type(); } diff --git a/mars_generic.c b/mars_generic.c index 72a97e61..1a902270 100644 --- a/mars_generic.c +++ b/mars_generic.c @@ -1,12 +1,77 @@ // (c) 2010 Thomas Schoebel-Theuer / 1&1 Internet AG +//#define BRICK_DEBUGGING +//#define MARS_DEBUGGING + #include #include #include +#include +#include +#include +#include #define _STRATEGY #include "mars.h" +#include +#include +#include + +// some helpers +int mars_mkdir(const char *path) +{ + mm_segment_t oldfs; + int status; + + oldfs = get_fs(); + set_fs(get_ds()); + status = sys_mkdir(path, 0700); + set_fs(oldfs); + + return status; +} +EXPORT_SYMBOL_GPL(mars_mkdir); + +int mars_symlink(const char *oldpath, const char *newpath, const struct timespec *stamp) +{ + int newlen = strlen(newpath); + char tmp[newlen + 16]; + mm_segment_t oldfs; + int status; + + snprintf(tmp, sizeof(tmp), "%s.tmp", newpath); + oldfs = get_fs(); + set_fs(get_ds()); + (void)sys_unlink(tmp); + status = sys_symlink(oldpath, tmp); + set_fs(oldfs); + + // TODO NYI: set timestamp + + if (status >= 0) { + status = mars_rename(tmp, newpath); + } + + return status; +} +EXPORT_SYMBOL_GPL(mars_symlink); + +int mars_rename(const char *oldpath, const char *newpath) +{ + mm_segment_t oldfs; + int status; + + oldfs = get_fs(); + set_fs(get_ds()); + status = sys_rename(oldpath, newpath); + set_fs(oldfs); + + return status; +} +EXPORT_SYMBOL_GPL(mars_rename); + + ////////////////////////////////////////////////////////////// // object stuff @@ -22,18 +87,615 @@ EXPORT_SYMBOL_GPL(mref_type); // brick stuff +////////////////////////////////////////////////////////////// + +// infrastructure + +static char *id = NULL; + +/* TODO: better use MAC addresses (or motherboard IDs where available). + * Or, at least, some checks for MAC addresses should be recorded / added. + * When the nodename is misconfigured, data might be scrambled. + * MAC addresses should be more secure. 
+ * In ideal case, further checks should be added to prohibit accidental + * name clashes. + */ +char *my_id(void) +{ + struct new_utsname *u; + if (id) + return id; + + //down_read(&uts_sem); // FIXME: this is currenty not EXPORTed from the kernel! + u = utsname(); + if (u) { + id = kstrdup(u->nodename, GFP_MARS); + } + //up_read(&uts_sem); + + return id; +} +EXPORT_SYMBOL_GPL(my_id); + +struct mars_global *mars_global = NULL; +EXPORT_SYMBOL_GPL(mars_global); + +void mars_trigger(void) +{ + if (mars_global) { + MARS_DBG("trigger...\n"); + mars_global->main_trigger = true; + wake_up_interruptible(&mars_global->main_event); + } +} +EXPORT_SYMBOL_GPL(mars_trigger); + +void mars_power_button(struct mars_brick *brick, bool val) +{ + bool oldval = brick->power.button; + if (val != oldval) { + MARS_DBG("brick '%s' type '%s' power button %d -> %d\n", brick->brick_path, brick->type->type_name, oldval, val); + set_button(&brick->power, val); + mars_trigger(); + } +} +EXPORT_SYMBOL_GPL(mars_power_button); + +void mars_power_led_on(struct mars_brick *brick, bool val) +{ + bool oldval = brick->power.led_on; + if (val != oldval) { + MARS_DBG("brick '%s' type '%s' led_on %d -> %d\n", brick->brick_path, brick->type->type_name, oldval, val); + set_led_on(&brick->power, val); + mars_trigger(); + } +} +EXPORT_SYMBOL_GPL(mars_power_led_on); + +void mars_power_led_off(struct mars_brick *brick, bool val) +{ + bool oldval = brick->power.led_off; + if (val != oldval) { + MARS_DBG("brick '%s' type '%s' led_off %d -> %d\n", brick->brick_path, brick->type->type_name, oldval, val); + set_led_off(&brick->power, val); + mars_trigger(); + } +} +EXPORT_SYMBOL_GPL(mars_power_led_off); ///////////////////////////////////////////////////////////////////// +// strategy layer + + +struct mars_cookie { + struct mars_global *global; + mars_dent_checker checker; + char *path; + void *parent; + int pathlen; + int allocsize; + int depth; +}; + +static +int get_inode(char *newpath, struct mars_dent *dent) 
+{ + mm_segment_t oldfs; + int status; + struct path path; + + oldfs = get_fs(); + set_fs(get_ds()); + + status = user_path_at(AT_FDCWD, newpath, 0, &path); + if (!status) { + struct inode *inode = path.dentry->d_inode; + memcpy(&dent->old_stat, &dent->new_stat, sizeof(dent->old_stat)); + generic_fillattr(inode, &dent->new_stat); + if (S_ISLNK(dent->new_stat.mode)) { + int len = dent->new_stat.size; + char *link; + status = -ENOMEM; + link = kmalloc(len + 1, GFP_MARS); + if (link) { + status = inode->i_op->readlink(path.dentry, link, len); + link[len] = '\0'; + if (status < 0 || + (dent->new_link && !strncmp(dent->new_link, link, len))) { + MARS_DBG("symlink free '%s' (%s) status = %d\n", link, dent->new_link ? dent->new_link : "", status); + kfree(link); + } else { + MARS_DBG("symlink new '%s' (%s) status = %d\n", link, dent->new_link ? dent->new_link : "", status); + if (dent->old_link) + kfree(dent->old_link); + dent->old_link = dent->new_link; + dent->new_link = link; + } + } + } + path_put(&path); + } + + set_fs(oldfs); + + if (dent->new_link) + MARS_IO("symlink '%s'\n", dent->new_link); + return status; +} + +static +int mars_filler(void *__buf, const char *name, int namlen, loff_t offset, + u64 ino, unsigned int d_type) +{ + struct mars_cookie *cookie = __buf; + struct mars_global *global = cookie->global; + struct list_head *anchor = &global->dent_anchor; + struct mars_dent *dent; + struct list_head *tmp; + struct mars_dent *best = NULL; + char *newpath; + int prefix = 0; + int pathlen; + int class; + int serial = 0; + + MARS_IO("ino = %llu len = %d offset = %lld type = %u\n", ino, namlen, offset, d_type); + + if (name[0] == '.') { + return 0; + } + + class = cookie->checker(cookie->path, name, namlen, d_type, &prefix, &serial); + if (class < 0) + return 0; + + pathlen = cookie->pathlen; + newpath = kmalloc(pathlen + namlen + 2, GFP_MARS); + if (unlikely(!newpath)) + goto err_mem0; + memcpy(newpath, cookie->path, pathlen); + newpath[pathlen++] = '/'; + 
memcpy(newpath + pathlen, name, namlen); + pathlen += namlen; + newpath[pathlen] = '\0'; + + MARS_IO("path = '%s'\n", newpath); + + for (tmp = anchor->next; tmp != anchor; tmp = tmp->next) { + int cmp; + dent = container_of(tmp, struct mars_dent, sub_link); + cmp = strcmp(dent->d_path, newpath); + if (!cmp) { + kfree(newpath); + return 0; + } + // keep the list sorted. find the next smallest member. + if ((dent->d_class < class || + (dent->d_class == class && + (dent->d_serial < serial || + (dent->d_serial == serial && + cmp < 0)))) + && + (!best || + best->d_class < dent->d_class || + (best->d_class == dent->d_class && + (best->d_serial < dent->d_serial || + (best->d_serial == dent->d_serial && + strcmp(best->d_path, dent->d_path) < 0))))) { + best = dent; + } + } + + dent = kzalloc(cookie->allocsize, GFP_MARS); + if (unlikely(!dent)) + goto err_mem1; + dent->d_name = kmalloc(namlen + 1, GFP_MARS); + if (unlikely(!dent->d_name)) + goto err_mem2; + + dent->d_type = d_type; + dent->d_class = class; + dent->d_serial = serial; + dent->d_parent = cookie->parent; + dent->d_depth = cookie->depth; + + memcpy(dent->d_name, name, namlen); + dent->d_name[namlen] = '\0'; + dent->d_namelen = namlen; + dent->d_rest = dent->d_name + prefix; + + dent->d_path = newpath; + dent->d_pathlen = pathlen; + + down(&global->mutex); + if (best) { + list_add(&dent->sub_link, &best->sub_link); + } else { + list_add_tail(&dent->sub_link, anchor); + } + up(&global->mutex); + return 0; + +err_mem2: + kfree(dent); +err_mem1: + kfree(newpath); +err_mem0: + return -ENOMEM; +} + +static int _mars_dent_work(struct mars_cookie *cookie) +{ + struct file *f; + mm_segment_t oldfs; + int status = 0; + + oldfs = get_fs(); + set_fs(get_ds()); + f = filp_open(cookie->path, O_DIRECTORY | O_RDONLY, 0); + set_fs(oldfs); + if (unlikely(IS_ERR(f))) { + return PTR_ERR(f); + } + + for (;;) { + status = vfs_readdir(f, mars_filler, cookie); + MARS_IO("vfs_readdir() status = %d\n", status); + if (status <= 0) + 
break; + } + + filp_close(f, NULL); + return status; +} + +int mars_dent_work(struct mars_global *global, char *dirname, int allocsize, mars_dent_checker checker, mars_dent_worker worker, void *buf, int maxdepth) +{ + static int version = 0; + struct mars_cookie cookie = { + .global = global, + .checker = checker, + .path = dirname, + .pathlen = strlen(dirname), + .allocsize = allocsize, + .depth = 0, + }; + struct list_head *tmp; + int rounds = 0; + int status; + int total_status = 0; + bool found_dir; + + version++; + total_status = _mars_dent_work(&cookie); + + if (total_status || !worker) { + goto done; + } + +restart: + found_dir = false; + + /* First, get all the inode information in a separate pass + * before starting work. + * The separate pass is necessary because some dents may + * forward-reference other dents, and it would be a pity if + * some inodes were not available or were outdated. + */ + for (tmp = global->dent_anchor.next; tmp != &global->dent_anchor; tmp = tmp->next) { + struct mars_dent *dent = container_of(tmp, struct mars_dent, sub_link); + // treat any member only once during this invocation + if (dent->d_version == version) + continue; + dent->d_version = version; + + MARS_IO("reading inode '%s'\n", dent->d_path); + status = get_inode(dent->d_path, dent); + total_status |= status; + + // recurse into subdirectories by inserting into the flat list + if (S_ISDIR(dent->new_stat.mode) && dent->d_depth <= maxdepth) { + struct mars_cookie sub_cookie = { + .global = global, + .checker = checker, + .path = dent->d_path, + .pathlen = dent->d_pathlen, + .allocsize = allocsize, + .parent = dent, + .depth = dent->d_depth + 1, + }; + found_dir = true; + status = _mars_dent_work(&sub_cookie); + total_status |= status; + if (status < 0) { + MARS_ERR("forward: status %d on '%s'\n", status, dent->d_path); + } + } + } + if (found_dir && ++rounds < 10) { + goto restart; + } + + /* Forward pass. 
+ */ + for (tmp = global->dent_anchor.next; tmp != &global->dent_anchor; tmp = tmp->next) { + struct mars_dent *dent = container_of(tmp, struct mars_dent, sub_link); + MARS_IO("forward treat '%s'\n", dent->d_path); + status = worker(buf, dent, false); + total_status |= status; + /* failures are reported, but do not abort the forward pass */ + if (status < 0) { + MARS_ERR("forward: status %d on '%s'\n", status, dent->d_path); + continue; + } + } + + /* Backward pass. + */ + for (tmp = global->dent_anchor.prev; tmp != &global->dent_anchor; tmp = tmp->prev) { + struct mars_dent *dent = container_of(tmp, struct mars_dent, sub_link); + MARS_IO("backward treat '%s'\n", dent->d_path); + status = worker(buf, dent, true); + total_status |= status; + if (status < 0) { + MARS_ERR("backwards: status %d on '%s'\n", status, dent->d_path); + } + } + +done: + return total_status; +} +EXPORT_SYMBOL_GPL(mars_dent_work); + +struct mars_dent *_mars_find_dent(struct mars_global *global, const char *path) +{ + struct mars_dent *res = NULL; + struct list_head *tmp; + + for (tmp = global->dent_anchor.next; tmp != &global->dent_anchor; tmp = tmp->next) { + struct mars_dent *tmp_dent = container_of(tmp, struct mars_dent, sub_link); + if (!strcmp(tmp_dent->d_path, path)) { + res = tmp_dent; + break; + } + } + + return res; +} +EXPORT_SYMBOL_GPL(_mars_find_dent); + +struct mars_dent *mars_find_dent(struct mars_global *global, const char *path) +{ + struct mars_dent *res; + down(&global->mutex); + res = _mars_find_dent(global, path); + up(&global->mutex); + return res; +} +EXPORT_SYMBOL_GPL(mars_find_dent); + +void mars_dent_free(struct mars_dent *dent) +{ + int i; + + list_del(&dent->sub_link); + + for (i = 0; i < MARS_ARGV_MAX; i++) { + if (dent->d_argv[i]) + kfree(dent->d_argv[i]); + } + if (dent->d_args) + kfree(dent->d_args); + if (dent->d_private) + kfree(dent->d_private); + if (dent->old_link) + kfree(dent->old_link); + if (dent->new_link) + kfree(dent->new_link); + kfree(dent->d_name); + kfree(dent->d_path); + kfree(dent); +}
+EXPORT_SYMBOL_GPL(mars_dent_free); + +void mars_dent_free_all(struct list_head *anchor) +{ + while (!list_empty(anchor)) { + struct mars_dent *dent; + dent = container_of(anchor->prev, struct mars_dent, sub_link); + mars_dent_free(dent); + } +} +EXPORT_SYMBOL_GPL(mars_dent_free_all); + + +struct mars_brick *mars_find_brick(struct mars_global *global, const void *brick_type, const char *path) +{ + struct list_head *tmp; + + if (!global || !path) + return NULL; + + down(&global->mutex); + + for (tmp = global->brick_anchor.next; tmp != &global->brick_anchor; tmp = tmp->next) { + struct mars_brick *test = container_of(tmp, struct mars_brick, brick_link); + if (!strcmp(test->brick_path, path)) { + up(&global->mutex); + if (brick_type && test->type != brick_type) { + MARS_ERR("bad brick type\n"); + return NULL; + } + return test; + } + } + + up(&global->mutex); + + return NULL; +} +EXPORT_SYMBOL_GPL(mars_find_brick); + +struct mars_brick *mars_make_brick(struct mars_global *global, const void *_brick_type, const char *path, const char *_name) +{ + const char *name = kstrdup(_name, GFP_MARS); + const char *names[] = { name }; + const struct generic_brick_type *brick_type = _brick_type; + const struct generic_input_type **input_types; + const struct generic_output_type **output_types; + struct mars_brick *res; + int size; + int i; + int status; + + if (!name) { + MARS_ERR("cannot allocate space for name\n"); + return NULL; + } + + size = brick_type->brick_size + + (brick_type->max_inputs + brick_type->max_outputs) * sizeof(void*); + input_types = brick_type->default_input_types; + for (i = 0; i < brick_type->max_inputs; i++) { + const struct generic_input_type *type = *input_types++; + if (unlikely(!type)) { + MARS_ERR("input_type %d is missing\n", i); + goto err_name; + } + size += type->input_size; + } + output_types = brick_type->default_output_types; + for (i = 0; i < brick_type->max_outputs; i++) { + const struct generic_output_type *type = *output_types++; + if 
(unlikely(!type)) { + MARS_ERR("output_type %d is missing\n", i); + goto err_name; + } + size += type->output_size; + } + + res = kzalloc(size, GFP_MARS); + if (!res) { + MARS_ERR("cannot grab %d bytes for brick type '%s'\n", size, brick_type->type_name); + goto err_name; + } + res->brick_path = kstrdup(path, GFP_MARS); + res->global = global; + if (!res->brick_path) { + MARS_ERR("cannot grab memory for path '%s'\n", path); + goto err_res; + } + + status = generic_brick_init_full(res, size, brick_type, NULL, NULL, names); + MARS_DBG("brick '%s' init '%s' '%s' (status=%d)\n", brick_type->type_name, path, name, status); + if (status < 0) { + MARS_ERR("cannot init brick %s\n", brick_type->type_name); + goto err_path; + } + + /* Immediately make it visible, regardless of internal state. + * Switching on / etc must be done separately. + */ + down(&global->mutex); + list_add(&res->brick_link, &global->brick_anchor); + up(&global->mutex); + + return res; + +err_path: + kfree(res->brick_path); +err_res: + kfree(res); +err_name: + kfree(name); + return NULL; +} +EXPORT_SYMBOL_GPL(mars_make_brick); + + +///////////////////////////////////////////////////////////////////// + +// meta descriptions + +const struct meta mars_info_meta[] = { + META_INI(current_size, struct mars_info, FIELD_INT), + META_INI(transfer_order, struct mars_info, FIELD_INT), + META_INI(transfer_size, struct mars_info, FIELD_INT), + {} +}; +EXPORT_SYMBOL_GPL(mars_info_meta); + +const struct meta mars_mref_meta[] = { + META_INI(ref_pos, struct mref_object, FIELD_INT), + META_INI(ref_len, struct mref_object, FIELD_INT), + META_INI(ref_may_write, struct mref_object, FIELD_INT), + META_INI(ref_flags, struct mref_object, FIELD_INT), + META_INI(ref_rw, struct mref_object, FIELD_INT), + META_INI(ref_id, struct mref_object, FIELD_INT), + META_INI(_ref_cb.cb_error, struct mref_object, FIELD_INT), + {} +}; +EXPORT_SYMBOL_GPL(mars_mref_meta); + +const struct meta mars_timespec_meta[] = { + META_INI(tv_sec, struct 
timespec, FIELD_INT), + META_INI(tv_nsec, struct timespec, FIELD_INT), + {} +}; +EXPORT_SYMBOL_GPL(mars_timespec_meta); + +const struct meta mars_kstat_meta[] = { + META_INI(ino, struct kstat, FIELD_INT), + META_INI(mode, struct kstat, FIELD_INT), + META_INI(size, struct kstat, FIELD_INT), + META_INI_SUB(atime, struct kstat, mars_timespec_meta), + META_INI_SUB(mtime, struct kstat, mars_timespec_meta), + META_INI_SUB(ctime, struct kstat, mars_timespec_meta), + META_INI(blksize, struct kstat, FIELD_INT), + {} +}; +EXPORT_SYMBOL_GPL(mars_kstat_meta); + +const struct meta mars_dent_meta[] = { + META_INI(d_name, struct mars_dent, FIELD_STRING), + META_INI(d_rest, struct mars_dent, FIELD_STRING), + META_INI(d_path, struct mars_dent, FIELD_STRING), + META_INI(d_namelen, struct mars_dent, FIELD_INT), + META_INI(d_pathlen, struct mars_dent, FIELD_INT), + META_INI(d_type, struct mars_dent, FIELD_INT), + META_INI(d_class, struct mars_dent, FIELD_INT), + META_INI(d_version, struct mars_dent, FIELD_INT), + META_INI_SUB(new_stat,struct mars_dent, mars_kstat_meta), + META_INI_SUB(old_stat,struct mars_dent, mars_kstat_meta), + META_INI(new_link, struct mars_dent, FIELD_STRING), + META_INI(old_link, struct mars_dent, FIELD_STRING), + META_INI(d_args, struct mars_dent, FIELD_STRING), + META_INI(d_argv[0], struct mars_dent, FIELD_STRING), + META_INI(d_argv[1], struct mars_dent, FIELD_STRING), + META_INI(d_argv[2], struct mars_dent, FIELD_STRING), + META_INI(d_argv[3], struct mars_dent, FIELD_STRING), + {} +}; +EXPORT_SYMBOL_GPL(mars_dent_meta); + +///////////////////////////////////////////////////////////////////// + +// init stuff + static int __init init_mars(void) { - printk(MARS_INFO "init_mars()\n"); + MARS_INF("init_mars()\n"); return 0; } static void __exit exit_mars(void) { - printk(MARS_INFO "exit_mars()\n"); + MARS_INF("exit_mars()\n"); + if (id) { + kfree(id); + id = NULL; + } } MODULE_DESCRIPTION("MARS block storage"); diff --git a/mars_if_device.c b/mars_if_device.c 
index 737adcca..f5140ce6 100644 --- a/mars_if_device.c +++ b/mars_if_device.c @@ -153,10 +153,10 @@ static int if_device_make_request(struct request_queue *q, struct bio *bio) bio->bi_check2 = 0; bio->bi_check3 = 0; - /* THIS IS PROVISIONARY + /* FIXME: THIS IS PROVISIONARY (use event instead) */ - while (unlikely(!brick->is_active)) { - msleep(100); + while (unlikely(!brick->power.led_on)) { + msleep(2 * 1000); /* 2s; msleep() takes milliseconds, not jiffies */ } _CHECK_ATOMIC(&bio->bi_comp_cnt, !=, 0); @@ -369,7 +369,7 @@ static int if_device_brick_destruct(struct if_device_brick *brick) return 0; } -static int if_device_switch(struct if_device_brick *brick, bool state) +static int if_device_switch(struct if_device_brick *brick) { struct if_device_input *input = brick->inputs[0]; struct request_queue *q; @@ -379,71 +379,91 @@ static int if_device_switch(struct if_device_brick *brick, bool state) unsigned long capacity; int status; - //MARS_DBG("1\n"); - - status = GENERIC_INPUT_CALL(input, mars_get_info, &info); - if (status < 0) { - MARS_ERR("cannot get device info, status=%d\n", status); - return status; - } - capacity = info.current_size >> 9; // TODO: make this dynamic + if (brick->power.button) { + mars_power_led_off((void*)brick, false); + status = GENERIC_INPUT_CALL(input, mars_get_info, &info); + if (status < 0) { + MARS_ERR("cannot get device info, status=%d\n", status); + return status; + } + capacity = info.current_size >> 9; // TODO: make this dynamic + + q = blk_alloc_queue(GFP_MARS); + if (!q) { + MARS_ERR("cannot allocate device request queue\n"); + return -ENOMEM; + } + q->queuedata = input; + input->q = q; + + //MARS_DBG("2\n"); + disk = alloc_disk(1); + if (!disk) { + MARS_ERR("cannot allocate gendisk\n"); + return -ENOMEM; + } - q = blk_alloc_queue(GFP_MARS); - if (!q) { - MARS_ERR("cannot allocate device request queue\n"); - return -ENOMEM; - } - q->queuedata = input; - input->q = q; - - //MARS_DBG("2\n"); - disk = alloc_disk(1); - if (!disk) { - MARS_ERR("cannot allocate gendisk\n"); - return
-ENOMEM; - } - - //MARS_DBG("3\n"); - minor = device_minor++; //TODO: protect against races (e.g. atomic_t) - disk->queue = q; - disk->major = MARS_MAJOR; //TODO: make this dynamic for >256 devices - disk->first_minor = minor; - disk->fops = &if_device_blkdev_ops; - sprintf(disk->disk_name, "mars%d", minor); - MARS_DBG("created device name %s\n", disk->disk_name); - disk->private_data = input; - set_capacity(disk, capacity); - - blk_queue_make_request(q, if_device_make_request); - blk_queue_max_segment_size(q, MARS_MAX_SEGMENT_SIZE); - blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); - q->unplug_fn = if_device_unplug; - sema_init(&input->kick_sem, 1); - spin_lock_init(&input->req_lock); - q->queue_lock = &input->req_lock; // needed! - //blk_queue_ordered(q, QUEUE_ORDERED_DRAIN, NULL);//??? - - //MARS_DBG("4\n"); - input->bdev = bdget(MKDEV(disk->major, minor)); - /* we have no partitions. we contain only ourselves. */ - input->bdev->bd_contains = input->bdev; + //MARS_DBG("3\n"); + minor = device_minor++; //TODO: protect against races (e.g. atomic_t) + disk->queue = q; + disk->major = MARS_MAJOR; //TODO: make this dynamic for >256 devices + disk->first_minor = minor; + disk->fops = &if_device_blkdev_ops; + //snprintf(disk->disk_name, sizeof(disk->disk_name), "mars%d", minor); + snprintf(disk->disk_name, sizeof(disk->disk_name), "mars/%s", brick->brick_name); + MARS_DBG("created device name %s\n", disk->disk_name); + disk->private_data = input; + set_capacity(disk, capacity); + + blk_queue_make_request(q, if_device_make_request); + blk_queue_max_segment_size(q, MARS_MAX_SEGMENT_SIZE); + blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); + q->unplug_fn = if_device_unplug; + sema_init(&input->kick_sem, 1); + spin_lock_init(&input->req_lock); + q->queue_lock = &input->req_lock; // needed! + //blk_queue_ordered(q, QUEUE_ORDERED_DRAIN, NULL);//??? + + //MARS_DBG("4\n"); + input->bdev = bdget(MKDEV(disk->major, minor)); + /* we have no partitions. we contain only ourselves. 
*/ + input->bdev->bd_contains = input->bdev; #if 0 // ??? - q->backing_dev_info.congested_fn = mars_congested; - q->backing_dev_info.congested_data = input; + q->backing_dev_info.congested_fn = mars_congested; + q->backing_dev_info.congested_data = input; #endif #if 0 // ??? - blk_queue_merge_bvec(q, mars_merge_bvec); + blk_queue_merge_bvec(q, mars_merge_bvec); #endif - INIT_LIST_HEAD(&input->plug_anchor); + INIT_LIST_HEAD(&input->plug_anchor); - // point of no return - //MARS_DBG("99999\n"); - add_disk(disk); - input->disk = disk; - //set_device_ro(input->bdev, 0); // TODO: implement modes - brick->is_active = true; + // point of no return + //MARS_DBG("99999\n"); + add_disk(disk); + input->disk = disk; + //set_device_ro(input->bdev, 0); // TODO: implement modes + mars_power_led_on((void*)brick, true); + } else { + mars_power_led_on((void*)brick, false); + if (input->bdev) { + bdput(input->bdev); + input->bdev = NULL; + } + disk = input->disk; + if (disk) { + q = disk->queue; + del_gendisk(input->disk); + put_disk(input->disk); + input->disk = NULL; + if (q) { + blk_cleanup_queue(q); + } + } + //........ 
+ mars_power_led_off((void*)brick, true); + } return 0; } @@ -518,7 +538,7 @@ EXPORT_SYMBOL_GPL(if_device_brick_type); static void __exit exit_if_device(void) { int status; - printk(MARS_INFO "exit_if_device()\n"); + MARS_INF("exit_if_device()\n"); status = if_device_unregister_brick_type(); unregister_blkdev(DRBD_MAJOR, "mars"); } @@ -529,7 +549,7 @@ static int __init init_if_device(void) (void)if_device_aspect_types; // not used, shut up gcc - printk(MARS_INFO "init_if_device()\n"); + MARS_INF("init_if_device()\n"); status = register_blkdev(DRBD_MAJOR, "mars"); if (status) return status; diff --git a/mars_if_device.h b/mars_if_device.h index 213ee03d..f21f6a9d 100644 --- a/mars_if_device.h +++ b/mars_if_device.h @@ -42,7 +42,6 @@ struct if_device_output { struct if_device_brick { MARS_BRICK(if_device); - bool is_active; struct if_device_output hidden_output; }; diff --git a/mars_light.c b/mars_light.c new file mode 100644 index 00000000..81e4faf6 --- /dev/null +++ b/mars_light.c @@ -0,0 +1,1850 @@ +// (c) 2011 Thomas Schoebel-Theuer / 1&1 Internet AG + +//#define BRICK_DEBUGGING +//#define MARS_DEBUGGING +//#define IO_DEBUGGING // here means: display full statistics + +#include +#include +#include +#include + +#include +#include +#include + +#define _STRATEGY +#include "mars.h" + +#include +#include + +// used brick types +#include "mars_server.h" +#include "mars_client.h" +#include "mars_copy.h" +#include "mars_device_aio.h" +#include "mars_trans_logger.h" +#include "mars_if_device.h" + +#define USE_TRANS_LOGGER // disable this ONLY FOR TESTING + +static struct task_struct *main_thread = NULL; + +struct light_dent { + MARS_DENT(light_dent); +}; + +typedef int (*light_worker_fn)(void *buf, struct light_dent *dent); + +struct light_class { + char *cl_name; + int cl_len; + char cl_type; + bool cl_hostcontext; + bool cl_serial; + int cl_father; + light_worker_fn cl_forward; + light_worker_fn cl_backward; +}; + +static +struct mars_brick *make_brick(struct 
mars_global *global, const void *_brick_type, const char *path, const char *name)
+{
+	struct mars_brick *res;
+	MARS_DBG("type = '%s' path = '%s' name = '%s'\n", ((struct generic_brick_type*)_brick_type)->type_name, path, name);
+	res = mars_make_brick(global, _brick_type, path, name);
+	MARS_DBG("brick = %p\n", res);
+	if (res) {
+		mars_trigger();
+	}
+	return res;
+}
+
+/* Switch a brick off and, once it reports led_off, unlink it from the
+ * global brick list and disconnect all of its inputs.
+ * A brick whose outputs are still connected is not touched (-EEXIST).
+ * When max_level > 0, the former predecessors are killed recursively
+ * with a decremented max_level.
+ * Returns the status of the switch operation, or the first error
+ * reported by a recursive kill.
+ */
+static
+int kill_brick(struct mars_brick *brick, int max_level)
+{
+	int i;
+	int status;
+
+	if (!brick) {
+		MARS_ERR("bad brick parameter\n");
+		return -EINVAL;
+	}
+
+	// first check whether the brick is in use somewhere
+	for (i = 0; i < brick->nr_outputs; i++) {
+		if (brick->outputs[i]->nr_connected > 0) {
+			MARS_DBG("brick '%s' not killable, output %i is used\n", brick->brick_name, i);
+			return -EEXIST;
+		}
+	}
+
+	/* make_all() jumps here exactly when brick->ops is missing,
+	 * so never dereference it unchecked.
+	 */
+	if (unlikely(!brick->ops)) {
+		MARS_ERR("brick '%s' has no operations, cannot switch off\n", brick->brick_name);
+		return -EINVAL;
+	}
+
+	MARS_DBG("===> killing brick name = '%s'\n", brick->brick_name);
+
+	// start shutdown
+	mars_power_button((void*)brick, false);
+	status = brick->ops->brick_switch(brick);
+
+	MARS_DBG("kill '%s' status = %d led_off = %d\n", brick->brick_name, status, brick->power.led_off);
+
+	// wait until clean shutdown
+	if (status >= 0 && brick->power.led_off) {
+		int count = 0;
+		struct mars_brick *prev[brick->nr_inputs];
+
+		// remove from the global list => no longer visible
+		down(&brick->global->mutex);
+		list_del_init(&brick->brick_link);
+		up(&brick->global->mutex);
+
+		/* Disconnect all inputs.
+		 * This must not start earlier, because during shutdown
+		 * the inputs could be needed for cleanup operations etc.
+		 */
+		for (i = 0; i < brick->nr_inputs; i++) {
+			if (brick->inputs[i]->connect) {
+				prev[count++] = brick->inputs[i]->connect->brick;
+				(void)generic_disconnect((void*)brick->inputs[i]);
+			}
+		}
+
+#if 1
+		/* recursively kill predecessors
+		 */
+		if (max_level > 0) {
+			struct mars_brick *old = NULL;
+			for (i = 0; i < count; i++) {
+				int sub;
+				if (!prev[i] || prev[i] == old || list_empty(&prev[i]->brick_link))
+					continue;
+				old = prev[i];
+				sub = kill_brick(prev[i], max_level - 1);
+				/* Remember the first error instead of OR-ing
+				 * errno values into a meaningless number.
+				 */
+				if (sub < 0 && status >= 0)
+					status = sub;
+#if 1
+				msleep(500);
+#endif
+			}
+		}
+#endif
+
+		/* This runs unchecked and may therefore leave memory remains,
+		 * but we currently have no separate list for "zombies".
+		 * TODO: do better.
+		 */
+#if 0 // TODO: debug locking crash
+		(void)generic_brick_exit_full((void*)brick);
+#endif
+		mars_trigger();
+	}
+	return status;
+}
+
+
+///////////////////////////////////////////////////////////////////////
+
+// internal helpers
+
+/* Assemble a path name into res (capacity len bytes, always terminated).
+ * Everything of prefix up to and including its first '+' is copied
+ * literally. When father is given and the remaining prefix does not
+ * start with '/', father->d_path and a '/' are inserted. Then the
+ * remaining prefix and the suffix follow.
+ * When a part does not fit, the result is cut off before that part.
+ */
+static
+void normalized_path(char *res, int len, struct light_dent *father, const char *prefix, const char *suffix)
+{
+	char *test;
+	int prelen;
+
+	// without any room, not even the terminator may be written
+	if (unlikely(len <= 0))
+		return;
+
+	test = strchr(prefix, '+');
+	if (test) {
+		test++;
+		prelen = test - prefix;
+		if (prelen >= len)
+			goto done;
+		memcpy(res, prefix, prelen);
+		res += prelen;
+		len -= prelen;
+		prefix = test;
+	}
+
+	if (father && *prefix != '/') {
+		prelen = strlen(father->d_path);
+		if (prelen+1 >= len)
+			goto done;
+		memcpy(res, father->d_path, prelen);
+		res += prelen;
+		*res++ = '/';
+		len -= prelen+1;
+	}
+
+	prelen = strlen(prefix);
+	if (prelen >= len)
+		goto done;
+	memcpy(res, prefix, prelen);
+	res += prelen;
+	len -= prelen;
+
+	prelen = strlen(suffix);
+	if (prelen >= len)
+		goto done;
+
+	strncpy(res, suffix, len);
+	res += prelen;
+done:
+	*res = '\0';
+}
+
+/* Look up a brick of the given type by its normalized path name.
+ */
+static
+struct mars_brick *find_other(struct mars_global *global, const void *brick_type, struct light_dent *father, const char *prefix, const char *suffix)
+{
+	int len = (father ? father->d_pathlen : 0)
+		+ strlen(prefix)
+		+ strlen(suffix)
+		+ 2;
+	char fullpath[len];
+
+	normalized_path(fullpath, len, father, prefix, suffix);
+	MARS_DBG("searching for '%s'\n", fullpath);
+	return mars_find_brick(global, brick_type, fullpath);
+}
+
+/* Create a new brick and connect its inputs to a set of predecessors.
+ * Before starting that, check whether all predecessors exist and are healthy.
+ * Input i is wired to output 0 of the brick found via
+ * brick_type[i] / prefix[i] / suffix[i].
+ * Returns NULL when a predecessor is missing or not switched on, or when
+ * creation failed. After a later failure the half-built brick is killed;
+ * it is returned nevertheless when that kill did not succeed.
+ */
+static
+struct mars_brick *make_all(struct mars_global *global,
+			    const void *new_brick_type,
+			    const char *new_path,
+			    const char *new_name,
+			    struct light_dent *father,
+			    const void *brick_type[],
+			    const char *prefix[],
+			    const char *suffix[],
+			    int count,
+			    bool switch_on)
+{
+	struct mars_brick *brick;
+	struct mars_brick *prev[count];
+	int status = 0;
+	int i;
+	int len = (father ? father->d_pathlen : 0)
+		+ strlen(new_path)
+		+ 2;
+	char fullpath[len];
+
+	normalized_path(fullpath, len, father, new_path, "");
+
+	// check whether all previous bricks exist and are healthy
+	for (i = 0; i < count; i++) {
+		prev[i] = find_other(global, brick_type[i], father, prefix[i], suffix[i]);
+		if (!prev[i]) {
+			MARS_DBG("previous brick '%s' '%s' '%s' does not exist\n", father ? father->d_path : "", prefix[i], suffix[i]);
+			return NULL;
+		}
+		if (!prev[i]->power.led_on) {
+			MARS_DBG("previous brick '%s' '%s' '%s' not healthy\n", father ? father->d_path : "", prefix[i], suffix[i]);
+			return NULL;
+		}
+	}
+
+	// special case for client brick: treat network indirection
+	if (new_brick_type == &client_brick_type) {
+		struct mars_dent *test;
+		char path[256];
+		snprintf(path, sizeof(path), "/mars/ips/ip-%s", new_name);
+		test = mars_find_dent(global, path);
+		if (test && test->new_link) {
+			MARS_DBG("translation '%s' => '%s'\n", new_name, test->new_link);
+			new_name = test->new_link;
+		}
+	}
+
+	// create it...
+	brick = make_brick(global, new_brick_type, fullpath, new_name);
+	if (unlikely(!brick)) {
+		MARS_DBG("creation failed '%s' '%s'\n", fullpath, new_name);
+		return NULL;
+	}
+
+	// special case for aio: file name is treated different IMPROVEME!
+	if (new_brick_type == &device_aio_brick_type) {
+		brick->outputs[0]->output_name = brick->brick_name;
+	}
+
+	// connect the wires
+	for (i = 0; i < count; i++) {
+		status = generic_connect((void*)brick->inputs[i], (void*)prev[i]->outputs[0]);
+		if (unlikely(status < 0)) {
+			MARS_ERR("'%s' '%s' cannot connect input %d\n", fullpath, new_name, i);
+			goto err;
+		}
+	}
+	if (!brick->ops) {
+		MARS_ERR("cannot start '%s' '%s'\n", fullpath, new_name);
+		goto err;
+	}
+
+	// switch on (may fail silently, but responsibility is at the workers)
+	if (switch_on) {
+		mars_power_button((void*)brick, true);
+		status = brick->ops->brick_switch(brick);
+		MARS_DBG("switch on status = %d\n", status);
+	}
+	return brick;
+
+err:
+	status = kill_brick(brick, 0);
+	if (status >= 0) {
+		brick = NULL;
+	}
+	return brick;
+}
+
+#define MARS_DELIM ','
+
+/* Split str at MARS_DELIM into exactly count arguments and store
+ * freshly allocated copies in dent->d_argv[0 .. count-1], releasing any
+ * previous values. The last argument takes the whole rest of the string.
+ * dent->d_args keeps a copy of the first string ever parsed.
+ * Returns 0, -EINVAL on missing arguments, or -ENOMEM.
+ */
+static int _parse_args(struct light_dent *dent, char *str, int count)
+{
+	int i;
+	int status = -EINVAL;
+	if (!str)
+		goto done;
+	if (!dent->d_args) {
+		dent->d_args = kstrdup(str, GFP_MARS);
+		if (!dent->d_args) {
+			status = -ENOMEM;
+			goto done;
+		}
+	}
+	for (i = 0; i < count; i++) {
+		char *tmp;
+		int len;
+		if (!*str)
+			goto done;
+		if (i == count-1) {
+			len = strlen(str);
+		} else {
+			// separate name: do not shadow the copy buffer "tmp"
+			char *delim = strchr(str, MARS_DELIM);
+			if (!delim)
+				goto done;
+			len = (delim - str);
+		}
+		tmp = kzalloc(len+1, GFP_MARS);
+		if (!tmp) {
+			status = -ENOMEM;
+			goto done;
+		}
+		if (dent->d_argv[i]) {
+			kfree(dent->d_argv[i]);
+		}
+		dent->d_argv[i] = tmp;
+		// the buffer is zeroed, thus the copy is already terminated
+		memcpy(dent->d_argv[i], str, len);
+		dent->d_argv[i][len] = '\0';
+
+		str += len;
+		if (i != count-1)
+			str++;
+	}
+	status = 0;
+done:
+	if (status < 0) {
+		MARS_ERR("bad syntax '%s' (should have %d args), status = %d\n", dent->d_args ?
dent->d_args : "", count, status);
+	}
+	return status;
+}
+
+
+///////////////////////////////////////////////////////////////////////
+
+// remote workers
+
+/* State of one remote peer, hanging at dent->d_private.
+ * The peer string is a private copy owned by this structure,
+ * path is borrowed from the caller of _make_peer().
+ */
+struct mars_peerinfo {
+	char *peer;
+	char *path;
+	struct socket *socket;
+	struct task_struct *thread;
+	wait_queue_head_t event;
+	light_worker_fn worker;
+	int maxdepth;
+};
+
+/* Worker for remote directory entries: replicate "resource-*"
+ * directories and symlinks locally, ignore everything else.
+ */
+static int run_bones(void *buf, struct light_dent *dent)
+{
+	int status = 0;
+	//struct mars_peerinfo *peer = buf;
+
+	if (S_ISDIR(dent->new_stat.mode)) {
+		if (strncmp(dent->d_name, "resource-", 9)) {
+			MARS_DBG("ignoring directory '%s'\n", dent->d_path);
+			return 0;
+		}
+		status = mars_mkdir(dent->d_path);
+		MARS_DBG("create directory '%s' status = %d\n", dent->d_path, status);
+	} else if (S_ISLNK(dent->new_stat.mode) && dent->new_link) {
+		status = mars_symlink(dent->new_link, dent->d_path, &dent->new_stat.mtime);
+		MARS_DBG("create symlink '%s' -> '%s' status = %d\n", dent->d_path, dent->new_link, status);
+	} else {
+		MARS_DBG("ignoring '%s'\n", dent->d_path);
+	}
+	return status;
+}
+
+///////////////////////////////////////////////////////////////////////
+
+// remote working infrastructure
+
+/* Shut down the connection of a peer (if any) and forget the socket.
+ * NOTE(review): the socket is only shut down here; whether it also
+ * needs to be released depends on mars_create_socket() -- please verify.
+ */
+static void _peer_cleanup(struct mars_peerinfo *peer)
+{
+	if (peer->socket) {
+		kernel_sock_shutdown(peer->socket, SHUT_WR);
+		peer->socket = NULL;
+	}
+	//...
+}
+
+/* Thread function: periodically fetch the directory entries below
+ * peer->path from the remote peer and hand each of them to
+ * peer->worker (when set). Runs until kthread_stop() is called.
+ */
+static int remote_thread(void *data)
+{
+	struct mars_peerinfo *peer = data;
+	struct sockaddr_storage sockaddr = {};
+	int status;
+
+	if (!peer)
+		return -1;
+
+	MARS_INF("-------- remote thread starting on peer '%s'\n", peer->peer);
+
+	//fake_mm();
+
+	status = mars_create_sockaddr(&sockaddr, peer->peer);
+	if (unlikely(status < 0)) {
+		MARS_ERR("unusable remote address '%s'\n", peer->peer);
+		goto done;
+	}
+
+	while (!kthread_should_stop()) {
+		LIST_HEAD(tmp_list);
+		struct mars_cmd cmd = {
+			.cmd_code = CMD_GETENTS,
+			.cmd_str1 = peer->path,
+			.cmd_int1 = peer->maxdepth,
+		};
+
+		if (!peer->socket) {
+			status = mars_create_socket(&peer->socket, &sockaddr, false);
+			if (unlikely(status < 0)) {
+				peer->socket = NULL;
+				MARS_INF("no connection to '%s'\n", peer->peer);
+				msleep(5000);
+				continue;
+			}
+			MARS_DBG("successfully opened socket to '%s'\n", peer->peer);
+			continue;
+		}
+
+		status = mars_send_struct(&peer->socket, &cmd, mars_cmd_meta);
+		if (unlikely(status < 0)) {
+			MARS_ERR("communication error on send, status = %d\n", status);
+			_peer_cleanup(peer);
+			msleep(5000);
+			continue;
+		}
+
+		status = mars_recv_dent_list(&peer->socket, &tmp_list);
+		if (unlikely(status < 0)) {
+			MARS_ERR("communication error on receive, status = %d\n", status);
+			_peer_cleanup(peer);
+			msleep(5000);
+			continue;
+		}
+
+		MARS_DBG("AHA!!!!!!!!!!!!!!!!!!!!\n");
+
+		{
+			struct list_head *tmp;
+			for (tmp = tmp_list.next; tmp != &tmp_list; tmp = tmp->next) {
+				struct light_dent *dent = container_of(tmp, struct light_dent, sub_link);
+				if (!dent->d_path) {
+					MARS_DBG("NULL\n");
+					continue;
+				}
+				MARS_DBG("path = '%s'\n", dent->d_path);
+				if (!peer->worker)
+					continue;
+				status = peer->worker(peer, dent);
+			}
+		}
+
+		//...
+
+		mars_dent_free_all(&tmp_list);
+
+		if (!kthread_should_stop())
+			msleep(10 * 1000);
+	}
+
+	MARS_INF("-------- remote thread terminating\n");
+
+	_peer_cleanup(peer);
+
+done:
+	//cleanup_mm();
+	/* _kill_peer() will call kthread_stop() on this thread, so it
+	 * must not exit on its own beforehand (error path above).
+	 */
+	while (!kthread_should_stop())
+		msleep(1000);
+	return 0;
+}
+
+///////////////////////////////////////////////////////////////////////
+
+// helpers for worker functions
+
+/* Stop the remote thread belonging to dent and release the peer
+ * structure. Does nothing while the global power button is on.
+ */
+static int _kill_peer(void *buf, struct light_dent *dent)
+{
+	struct mars_global *global = buf;
+	struct mars_peerinfo *peer = dent->d_private;
+	int status = 0;
+
+	if (global->global_power.button) {
+		return 0;
+	}
+	if (!peer) {
+		return 0;
+	}
+	if (!peer->thread) {
+		MARS_ERR("oops, remote thread is not running - doing cleanup myself\n");
+		_peer_cleanup(peer);
+		status = -1;
+	} else {
+		// returns only after the thread function has finished
+		kthread_stop(peer->thread);
+	}
+	dent->d_private = NULL;
+	// nobody references the structure any more: do not leak it
+	kfree(peer->peer);
+	kfree(peer);
+	return status;
+}
+
+/* Make sure that a peer structure and a running remote thread exist
+ * for dent. When mypeer is NULL, the peer name is taken from the
+ * symlink value of dent. Does nothing while the global power button
+ * is off.
+ */
+static int _make_peer(void *buf, struct light_dent *dent, char *mypeer, char *path, light_worker_fn worker)
+{
+	static int serial = 0;
+	struct mars_global *global = buf;
+	struct mars_peerinfo *peer;
+	int status = 0;
+
+	if (!global->global_power.button || !dent->d_parent || !dent->new_link) {
+		return 0;
+	}
+	if (!mypeer) {
+		status = _parse_args(dent, dent->new_link, 1);
+		if (status < 0)
+			goto done;
+		mypeer = dent->d_argv[0];
+	}
+
+	MARS_DBG("peer '%s'\n", mypeer);
+	if (!dent->d_private) {
+		peer = kzalloc(sizeof(struct mars_peerinfo), GFP_MARS);
+		if (!peer) {
+			MARS_ERR("no memory for peer structure\n");
+			return -1;
+		}
+
+		/* Keep a private copy: dent->d_argv[0] is freed and
+		 * reallocated by _parse_args() on every later round,
+		 * while the thread continues to use the name.
+		 */
+		peer->peer = kstrdup(mypeer, GFP_MARS);
+		if (!peer->peer) {
+			MARS_ERR("no memory for peer structure\n");
+			kfree(peer);
+			return -1;
+		}
+		peer->path = path;
+		peer->worker = worker;
+		peer->maxdepth = 2;
+		init_waitqueue_head(&peer->event);
+		dent->d_private = peer;
+	}
+	peer = dent->d_private;
+	if (!peer->thread) {
+		peer->thread = kthread_create(remote_thread, peer, "mars_remote%d", serial++);
+		if (unlikely(IS_ERR(peer->thread))) {
+			MARS_ERR("cannot start peer thread, status = %d\n", (int)PTR_ERR(peer->thread));
+			peer->thread = NULL;
+			return -1;
+		}
+		wake_up_process(peer->thread);
+	}
+
+done:
+	return status;
+}
+
+static int
_kill_remote(void *buf, struct light_dent *dent)
+{
+	// "_remote-" symlinks: same teardown as any other peer
+	return _kill_peer(buf, dent);
+}
+
+/* "_remote-" symlinks: peer name from the link value, no worker.
+ */
+static int _make_remote(void *buf, struct light_dent *dent)
+{
+	return _make_peer(buf, dent, NULL, "/mars", NULL);
+}
+
+static int kill_scan(void *buf, struct light_dent *dent)
+{
+	return _kill_peer(buf, dent);
+}
+
+/* "ips" directory: start a peer on "/mars/ips" with run_bones()
+ * as worker.
+ */
+static int make_scan(void *buf, struct light_dent *dent)
+{
+	return _make_peer(buf, dent, "/mars/ips", "/mars", run_bones);
+}
+
+
+/* Kill the brick named like dent->d_path (of any type), following
+ * its predecessors up to maxlevel levels. Does nothing while the
+ * global power button is on, or when no such brick exists.
+ */
+static
+int _kill_default(void *buf, struct light_dent *dent, int maxlevel)
+{
+	struct mars_global *global = buf;
+	struct mars_brick *brick;
+
+	if (global->global_power.button) {
+		return 0;
+	}
+	brick = mars_find_brick(global, NULL, dent->d_path);
+	if (!brick) {
+		return 0;
+	}
+	return kill_brick(brick, maxlevel);
+}
+
+
+// kill only the brick itself
+static
+int kill_default(void *buf, struct light_dent *dent)
+{
+	return _kill_default(buf, dent, 0);
+}
+
+// kill the brick and (practically) all of its predecessors
+static
+int kill_all(void *buf, struct light_dent *dent)
+{
+	return _kill_default(buf, dent, 999);
+}
+
+
+///////////////////////////////////////////////////////////////////////
+
+// handlers / helpers for logfile rotation
+
+/* Per-resource state of logfile rotation, hanging at the d_private
+ * pointer of the resource directory. Most fields are reset by
+ * make_log_init() at every round.
+ */
+struct mars_rotate {
+	struct light_dent *replay_link;
+	struct light_dent *aio_dent;
+	struct device_aio_brick *aio_brick;
+	struct mars_info aio_info;
+	struct trans_logger_brick *trans_brick;
+	struct light_dent *relevant_log;
+	struct light_dent *current_log;
+	struct light_dent *prev_log;
+	struct light_dent *next_log;
+	long long last_jiffies;
+	loff_t start_pos;
+	loff_t end_pos;
+	int max_sequence;
+	bool has_error;
+	bool do_replay;
+	bool is_primary;
+	bool create_once;
+};
+
+/* Create an empty logfile. Because of O_EXCL an already existing
+ * file is never touched. Failure is not reported to the caller.
+ */
+static
+void _create_new_logfile(char *path)
+{
+	struct file *f;
+	const int flags = O_RDWR | O_CREAT | O_EXCL;
+	const int prot = 0600;
+	mm_segment_t oldfs;
+
+	oldfs = get_fs();
+	set_fs(get_ds());
+	f = filp_open(path, flags, prot);
+	set_fs(oldfs);
+	/* filp_open() reports failure as ERR_PTR(), never as NULL:
+	 * an error pointer must not be passed to filp_close().
+	 */
+	if (!f || IS_ERR(f)) {
+		MARS_DBG("cannot create logfile '%s' status = %d\n", path, f ? (int)PTR_ERR(f) : -ENOENT);
+		return;
+	}
+	filp_close(f, NULL);
+	MARS_DBG("created empty logfile '%s'\n", path);
+	mars_trigger();
+}
+
+static
+int
_update_link(struct mars_rotate *rot, struct light_dent *parent, int sequence, loff_t pos)
+{
+	/* Point the symlink <parent>/replay-<my_id> to
+	 * "log-<sequence>-<my_id>,<pos>" and remember the time of the
+	 * last successful update in rot->last_jiffies.
+	 */
+	struct timespec now = {};
+	char old[128] = {};
+	char new[128] = {};
+	int status = 0;
+
+	// loff_t is cast explicitly to match the %lld conversion
+	snprintf(old, sizeof(old), "log-%09d-%s,%lld", sequence, my_id(), (long long)pos);
+	snprintf(new, sizeof(new), "%s/replay-%s", parent->d_path, my_id());
+
+	get_lamport(&now);
+	status = mars_symlink(old, new, &now);
+	if (status < 0) {
+		MARS_ERR("cannot create symlink '%s' -> '%s' status = %d\n", old, new, status);
+	} else {
+		MARS_DBG("make symlink '%s' -> '%s' status = %d\n", old, new, status);
+	}
+	if (status >= 0) {
+		rot->last_jiffies = jiffies;
+	}
+	return status;
+}
+
+/* This must be called once at every round of logfile checking.
+ * It (re)initializes the per-resource rotation state, reads the replay
+ * status symlink, and fetches or creates the AIO brick of the
+ * referenced logfile as well as the transaction logger brick.
+ */
+static
+int make_log_init(void *buf, struct light_dent *parent)
+{
+	struct mars_global *global = buf;
+	struct mars_brick *aio_brick;
+	struct mars_brick *trans_brick;
+	struct mars_rotate *rot = parent->d_private;
+	struct light_dent *replay_link;
+	struct light_dent *aio_dent;
+	struct mars_output *output;
+	char tmp[128] = {};
+	int status;
+
+	if (!rot) {
+		rot = kzalloc(sizeof(struct mars_rotate), GFP_MARS);
+		parent->d_private = rot;
+		if (!rot) {
+			MARS_ERR("cannot allocate rot structure\n");
+			status = -ENOMEM;
+			goto done;
+		}
+	}
+
+	// note: trans_brick, current_log, is_primary etc survive the rounds
+	rot->replay_link = NULL;
+	rot->aio_dent = NULL;
+	rot->aio_brick = NULL;
+	rot->relevant_log = NULL;
+	rot->prev_log = NULL;
+	rot->next_log = NULL;
+	rot->max_sequence = 0;
+	rot->has_error = false;
+
+	/* Fetch the replay status symlink.
+	 * It must exist, and its value will control everything.
+	 */
+	normalized_path(tmp, sizeof(tmp), parent, "replay-", my_id());
+
+	replay_link = (void*)mars_find_dent(global, tmp);
+	if (!replay_link || !replay_link->new_link) {
+		MARS_ERR("replay status symlink '%s' does not exist (%p)\n", tmp, replay_link);
+		status = -ENOENT;
+		goto done;
+	}
+
+	// the link value has the form "<logfile>,<position>"
+	status = _parse_args(replay_link, replay_link->new_link, 2);
+	if (status < 0) {
+		goto done;
+	}
+	rot->replay_link = replay_link;
+
+	/* Fetch the referenced AIO dentry.
+	 */
+	normalized_path(tmp, sizeof(tmp), parent, replay_link->d_argv[0], "");
+
+	aio_dent = (void*)mars_find_dent(global, tmp);
+	if (unlikely(!aio_dent)) {
+		MARS_ERR("logfile '%s' does not exist\n", tmp);
+		status = -ENOENT;
+		if (rot->is_primary) { // try to create an empty logfile
+			_create_new_logfile(tmp);
+		}
+		goto done;
+	}
+	rot->aio_dent = aio_dent;
+
+	/* Fetch / make the AIO brick instance
+	 */
+	aio_brick = mars_find_brick(global, &device_aio_brick_type, tmp);
+	MARS_DBG("search for '%s' -> found %p\n", tmp, aio_brick);
+	if (!aio_brick) {
+		aio_brick =
+			make_all(global,
+				 &device_aio_brick_type,
+				 tmp,
+				 tmp,
+				 parent,
+				 (const void *[]){},
+				 (const char *[]){},
+				 (const char *[]){},
+				 0,
+				 true);
+		if (!aio_brick) {
+			MARS_ERR("cannot access '%s'\n", tmp);
+			status = -EIO;
+			goto done;
+		}
+	}
+	rot->aio_brick = (void*)aio_brick;
+
+	/* Fetch the actual logfile size
+	 */
+	output = aio_brick->outputs[0];
+	status = output->ops->mars_get_info(output, &rot->aio_info);
+	if (status < 0) {
+		MARS_ERR("cannot get info on '%s'\n", tmp);
+		goto done;
+	}
+	MARS_DBG("logfile '%s' size = %lld\n", tmp, rot->aio_info.current_size);
+
+	/* Fetch / make the transaction logger
+	 */
+	trans_brick = mars_find_brick(global, &trans_logger_brick_type, parent->d_path);
+	MARS_DBG("search for transaction logger '%s' -> found %p\n", parent->d_path, trans_brick);
+	if (!trans_brick) {
+		/* We deliberately "forget" to connect the log input here.
+		 * Will be carried out later in make_log().
+		 * The final switch-on will be started in make_log_finalize().
+		 */
+		trans_brick =
+			make_all(global,
+				 &trans_logger_brick_type,
+				 parent->d_path,
+				 parent->d_path,
+				 parent,
+				 (const void *[]){&device_aio_brick_type},
+				 (const char *[]){"data-"},
+				 (const char *[]){my_id()},
+				 1,
+				 false);
+		status = -ENOENT;
+		if (!trans_brick)
+			goto done;
+	}
+	rot->trans_brick = (void*)trans_brick;
+	/* For safety, default is to try an (unnecessary) replay in case
+	 * something goes wrong later.
+	 */
+	rot->do_replay = true;
+
+	status = 0;
+
+done:
+	return status;
+}
+
+
+/* Internal helper. Return codes:
+ * ret <  0 : error
+ * ret == 0 : not relevant
+ * ret == 1 : relevant, no transaction replay
+ * ret == 2 : relevant for transaction replay
+ * ret == 3 : relevant for appending
+ *
+ * *oldpos receives the position parsed from the replay symlink.
+ * *newpos receives the current logfile size (for ret == 1 and ret == 2).
+ */
+static
+int _check_logging_status(struct mars_global *global, struct light_dent *dent, long long *oldpos, long long *newpos)
+{
+	struct light_dent *parent = dent->d_parent;
+	// never dereference a missing parent
+	struct mars_rotate *rot = parent ? parent->d_private : NULL;
+	int status = -EINVAL;
+
+	CHECK_PTR(rot, done);
+
+	status = 0;
+	if (!rot->replay_link || !rot->aio_dent || !rot->aio_brick) {
+		//MARS_DBG("nothing to do on '%s'\n", dent->d_path);
+		goto done;
+	}
+
+	if (rot->aio_dent->d_serial != dent->d_serial) {
+		//MARS_DBG("serial number %d not relevant\n", dent->d_serial);
+		goto done;
+	}
+
+	if (!rot->replay_link->d_argv[1] || sscanf(rot->replay_link->d_argv[1], "%lld", oldpos) != 1) {
+		MARS_ERR("bad position argument '%s'\n", rot->replay_link->d_argv[1] ? rot->replay_link->d_argv[1] : "");
+		status = -EINVAL;
+		goto done;
+	}
+
+	if (unlikely(rot->aio_info.current_size < *oldpos)) {
+		MARS_ERR("oops, bad replay position attempted in logfile '%s' (file length %lld should never be smaller than requested position %lld, is your filesystem corrupted?) => please repair this by hand\n", rot->aio_dent->d_path, rot->aio_info.current_size, *oldpos);
+		status = -EINVAL;
+		goto done;
+	}
+
+	if (rot->aio_info.current_size > *oldpos) {
+		MARS_INF("transaction log replay is necessary on '%s' from %lld to %lld\n", rot->aio_dent->d_path, *oldpos, rot->aio_info.current_size);
+		*newpos = rot->aio_info.current_size;
+		status = 2;
+	} else if (rot->aio_info.current_size > 0) {
+		MARS_INF("transaction log '%s' is already applied (would be usable for appending at position %lld, but a fresh log is needed for safety reasons)\n", rot->aio_dent->d_path, *oldpos);
+		*newpos = rot->aio_info.current_size;
+		status = 1;
+	} else if (!rot->is_primary) {
+		MARS_INF("empty transaction log '%s' would be usable, but I am not primary\n", rot->aio_dent->d_path);
+		status = 0;
+	} else {
+		MARS_INF("empty transaction log '%s' is usable for me as a primary node\n", rot->aio_dent->d_path);
+		status = 3;
+	}
+
+done:
+	return status;
+}
+
+
+/* Note: this is strictly called in d_serial order.
+ * This is important!
+ */
+static
+int make_log(void *buf, struct light_dent *dent)
+{
+	/* Handler for each "log-" file: decide whether this logfile is
+	 * the relevant one for replay or for appending, and record the
+	 * decision in the rotation state of the parent directory.
+	 * Any error sets rot->has_error for the rest of the round.
+	 */
+	struct mars_global *global = buf;
+	struct light_dent *parent = dent->d_parent;
+	// the parent is checked below, so do not dereference it blindly
+	struct mars_rotate *rot = parent ? parent->d_private : NULL;
+	struct trans_logger_brick *trans_brick;
+	struct light_dent *prev_log;
+	loff_t start_pos = 0;
+	loff_t end_pos = 0;
+	int status = -EINVAL;
+
+	CHECK_PTR(rot, err);
+
+	status = 0;
+	trans_brick = rot->trans_brick;
+	if (!global->global_power.button || !dent->d_parent || !trans_brick || rot->has_error) {
+		goto done;
+	}
+
+	if (dent->d_serial > rot->max_sequence) {
+		rot->max_sequence = dent->d_serial;
+	}
+
+	/* Check for consecutiveness of logfiles
+	 */
+	prev_log = rot->next_log;
+	if (prev_log && prev_log->d_serial + 1 != dent->d_serial) {
+		MARS_ERR("transaction logs are not consecutive at '%s' (%d ~> %d)\n", dent->d_path, prev_log->d_serial, dent->d_serial);
+		status = -EINVAL;
+		goto done;
+	}
+
+	/* Skip any logfiles after the relevant one.
+	 * This should happen only when replaying multiple logfiles
+	 * in sequence, or when starting a new logfile for writing.
+	 */
+	if (rot->relevant_log) {
+		if (rot->do_replay) {
+			status = 0;
+			goto ok;
+		}
+		/* NOTE(review): this tests the size of rot->aio_dent,
+		 * while the message talks about dent -- please verify
+		 * which one is meant.
+		 */
+		if (rot->aio_dent->new_stat.size > 0) {
+			MARS_ERR("oops, the new logfile '%s' is not empty -- for safety, I will not use it -- log rotation is disabled now\n", dent->d_path);
+			status = -EINVAL;
+			goto done;
+		}
+		MARS_DBG("considering next logfile '%s' for rotation\n", dent->d_path);
+	}
+
+	/* Find current logging status.
+	 */
+	status = _check_logging_status(global, dent, &start_pos, &end_pos);
+	if (status < 0) {
+		goto done;
+	}
+	/* Relevant or not?
+	 */
+	switch (status) {
+	case 0: // not relevant
+		goto ok;
+	case 1: // relevant, but transaction replay already finished
+		if (!trans_brick->power.button && !trans_brick->power.led_on && trans_brick->power.led_off) {
+			_update_link(rot, dent->d_parent, dent->d_serial + 1, 0);
+			mars_trigger();
+		}
+		status = -EAGAIN;
+		goto done;
+	case 2: // relevant for transaction replay
+		MARS_DBG("replaying transaction log '%s' from %lld to %lld\n", dent->d_path, start_pos, end_pos);
+		rot->do_replay = true;
+		rot->start_pos = start_pos;
+		rot->end_pos = end_pos;
+		rot->relevant_log = dent;
+		break;
+	case 3: // relevant for appending
+		MARS_DBG("appending to transaction log '%s'\n", dent->d_path);
+		rot->do_replay = false;
+		rot->start_pos = 0;
+		rot->end_pos = 0;
+		rot->relevant_log = dent;
+		break;
+	default:
+		MARS_ERR("bad internal status %d\n", status);
+		status = -EINVAL;
+		goto done;
+	}
+
+ok:
+	/* All ok: switch over the indicators.
+	 */
+	rot->prev_log = rot->next_log;
+	rot->next_log = dent;
+
+done:
+	if (status < 0) {
+		rot->has_error = true;
+	}
+err:
+	return status;
+}
+
+/* Connect input 1 of the transaction logger to the AIO brick of the
+ * relevant logfile, hand over the replay parameters, and switch the
+ * logger on. Does nothing when its button is already on.
+ */
+static
+int _start_trans(struct mars_rotate *rot)
+{
+	struct trans_logger_brick *trans_brick = rot->trans_brick;
+	int status = 0;
+
+	if (trans_brick->power.button) {
+		goto done;
+	}
+
+	/* Internal safety checks
+	 */
+	status = -EINVAL;
+	if (unlikely(!rot->aio_brick || !rot->relevant_log)) {
+		MARS_ERR("something is missing, this should not happen\n");
+		goto done;
+	}
+
+	/* For safety, disconnect old connection first
+	 */
+	if (trans_brick->inputs[1]->connect) {
+		(void)generic_disconnect((void*)trans_brick->inputs[1]);
+	}
+
+	/* Connect to new transaction log
+	 */
+	status = generic_connect((void*)trans_brick->inputs[1], (void*)rot->aio_brick->outputs[0]);
+	if (status < 0) {
+		goto done;
+	}
+
+	/* Supply all relevant parameters
+	 */
+	trans_brick->sequence = rot->relevant_log->d_serial;
+	trans_brick->do_replay = rot->do_replay;
+	trans_brick->current_pos = rot->start_pos;
+	trans_brick->start_pos = rot->start_pos;
+	trans_brick->end_pos = rot->end_pos;
+
+	/* Switch on....
+	 */
+	mars_power_button((void*)trans_brick, true);
+	status = trans_brick->ops->brick_switch(trans_brick);
+	MARS_DBG("status = %d\n", status);
+
+done:
+	return status;
+}
+
+/* Switch the transaction logger off and, once it reports led_off,
+ * disconnect its log input. Does nothing when its button is already off.
+ */
+static
+int _stop_trans(struct mars_rotate *rot)
+{
+	struct trans_logger_brick *trans_brick = rot->trans_brick;
+	int status = 0;
+
+	if (!trans_brick->power.button) {
+		goto done;
+	}
+
+	/* Switch off....
+	 */
+	mars_power_button((void*)trans_brick, false);
+	status = trans_brick->ops->brick_switch(trans_brick);
+	MARS_DBG("status = %d\n", status);
+	if (status < 0) {
+		goto done;
+	}
+
+	/* Disconnect old connection
+	 */
+	if (trans_brick->inputs[1]->connect && trans_brick->power.led_off) {
+		(void)generic_disconnect((void*)trans_brick->inputs[1]);
+	}
+
+done:
+	return status;
+}
+
+/* Act on the decisions collected by make_log() during this round:
+ * stop a running transaction logger when its work is done (updating
+ * the replay symlink), create a fresh logfile when none is usable,
+ * or start the logger on the relevant logfile.
+ */
+static
+int make_log_finalize(struct mars_global *global, struct light_dent *parent)
+{
+	struct mars_rotate *rot = parent->d_private;
+	struct trans_logger_brick *trans_brick;
+	int status = -EINVAL;
+
+	CHECK_PTR(rot, done);
+
+	trans_brick = rot->trans_brick;
+
+	status = 0;
+	if (!trans_brick) {
+		MARS_DBG("nothing to do\n");
+		goto done;
+	}
+	/* Stopping is also possible in case of errors
+	 */
+	if (trans_brick->power.button && trans_brick->power.led_on && !trans_brick->power.led_off) {
+		bool do_stop =
+			(rot->do_replay || trans_brick->do_replay)
+			? (trans_brick->current_pos == trans_brick->end_pos)
+			: (rot->relevant_log && rot->relevant_log != rot->current_log);
+		MARS_DBG("do_stop = %d\n", (int)do_stop);
+
+		// refresh the symlink at least every 5 seconds
+		if (do_stop || (long long)jiffies > rot->last_jiffies + 5 * HZ) {
+			status = _update_link(rot, parent, trans_brick->sequence, trans_brick->current_pos);
+		}
+		if (do_stop) {
+			status = _stop_trans(rot);
+		}
+		goto done;
+	}
+	/* Special case: after a fresh start, when no logfile exists,
+	 * create one. This is a thin exception from the rule that
+	 * normally userspace should control what happens in MARS.
+	 */
+	if (!rot->relevant_log && rot->is_primary && !rot->has_error && rot->max_sequence > 0 && !rot->create_once) { // try to create an empty logfile
+		char tmp[128] = {};
+		snprintf(tmp, sizeof(tmp), "%s/log-%09d-%s", parent->d_path, rot->max_sequence + 1, my_id());
+		_create_new_logfile(tmp);
+		rot->create_once = true;
+		msleep(1000);
+		goto done;
+	}
+	/* Starting is only possible when no error occurred.
+	 */
+	if (!rot->relevant_log || rot->has_error) {
+		MARS_DBG("nothing to do\n");
+		goto done;
+	}
+
+	/* Start when necessary
+	 */
+	if (!trans_brick->power.button && !trans_brick->power.led_on && trans_brick->power.led_off) {
+		bool do_start = (!rot->do_replay || rot->start_pos != rot->end_pos);
+		MARS_DBG("do_start = %d\n", (int)do_start);
+
+		if (do_start) {
+			status = _start_trans(rot);
+			rot->current_log = rot->relevant_log;
+		}
+	} else {
+		MARS_DBG("trans_brick %d %d %d\n", trans_brick->power.button, trans_brick->power.led_on, trans_brick->power.led_off);
+	}
+
+done:
+	return status;
+}
+
+///////////////////////////////////////////////////////////////////////
+
+// specific handlers
+
+/* Handler for the "primary" symlink: this node is primary when the
+ * link value equals my_id().
+ */
+static
+int make_primary(void *buf, struct light_dent *dent)
+{
+	struct light_dent *parent = dent->d_parent;
+	// a missing parent is treated like a missing rotate structure
+	struct mars_rotate *rot = parent ? parent->d_private : NULL;
+	int status = -EINVAL;
+
+	CHECK_PTR(rot, done);
+
+	rot->is_primary = (dent->new_link && !strcmp(dent->new_link, my_id()));
+	status = 0;
+
+done:
+	return status;
+}
+
+/* Handler for data files: create and switch on an AIO brick named
+ * like the file, unless it already exists.
+ */
+static
+int make_aio(void *buf, struct light_dent *dent)
+{
+	struct mars_global *global = buf;
+	struct mars_brick *brick;
+	struct device_aio_brick *_brick;
+	int status;
+
+	if (!global->global_power.button) {
+		return 0;
+	}
+	if (mars_find_brick(global, &device_aio_brick_type, dent->d_path)) {
+		return 0;
+	}
+	brick = make_brick(global, &device_aio_brick_type, dent->d_path, dent->d_path);
+	if (!brick)
+		return -1;
+	brick->outputs[0]->output_name = dent->d_path;
+	mars_power_button((void*)brick, true);
+	_brick = (void*)brick;
+	_brick->outputs[0]->o_fdsync = true;
+	status = brick->ops->brick_switch(brick);
+	if (status < 0) {
+		/* NOTE(review): kill_default() returns immediately while
+		 * the global button is on, which is always the case
+		 * here -- this cleanup looks like a no-op, please verify.
+		 */
+		kill_default(buf, dent);
+	}
+	return status;
+}
+
+/* Handler for the "device-" symlink: create the if_device brick on
+ * top of the transaction logger (or directly on the data brick when
+ * USE_TRANS_LOGGER is not defined).
+ */
+static int make_dev(void *buf, struct light_dent *dent)
+{
+	struct mars_global *global = buf;
+	struct light_dent *parent = dent->d_parent;
+	// the parent is checked below, so do not dereference it blindly
+	struct mars_rotate *rot = parent ? parent->d_private : NULL;
+	struct mars_brick *dev_brick;
+	int status = 0;
+
+	if (!global->global_power.button || !dent->d_parent || !dent->new_link) {
+		MARS_DBG("nothing to do\n");
+		goto done;
+	}
+
+#ifdef USE_TRANS_LOGGER
+	status = make_log_finalize(global, dent->d_parent);
+	if (status < 0) {
+		MARS_DBG("logger not initialized\n");
+		goto done;
+	}
+	if (!rot || !rot->is_primary) {
+		MARS_DBG("I am not primary, don't show the device\n");
+		goto done;
+	}
+	if (!rot->trans_brick || rot->trans_brick->do_replay || !rot->trans_brick->power.led_on || rot->trans_brick->power.led_off) {
+		MARS_DBG("transaction logger not ready for writing\n");
+		goto done;
+	}
+#endif
+
+	status = _parse_args(dent, dent->new_link, 1);
+	if (status < 0) {
+		goto done;
+	}
+	dev_brick = mars_find_brick(global, &if_device_brick_type, dent->d_path);
+	MARS_DBG("search for '%s' -> found %p\n", dent->d_path, dev_brick);
+	if (!dev_brick) {
+		dev_brick =
+			make_all(global,
+				 &if_device_brick_type,
+				 dent->d_path,
+				 dent->d_argv[0],
+				 dent->d_parent,
+#ifdef USE_TRANS_LOGGER
+				 (const void *[]){&trans_logger_brick_type},
+				 (const char *[]){dent->d_parent->d_path},
+				 (const char *[]){""},
+#else // direct connection, ONLY FOR TESTING!!!
+				 (const void *[]){&device_aio_brick_type},
+				 (const char *[]){"data-"},
+				 (const char *[]){my_id()},
+#endif
+				 1,
+				 true);
+		if (!dev_brick)
+			return -1;
+	}
+
+done:
+	return status;
+}
+
+/* Handler for "_direct-" symlinks with the value "<remote>,<device>":
+ * create a client brick for <remote> and an if_device brick on top.
+ */
+static int _make_direct(void *buf, struct light_dent *dent)
+{
+	struct mars_global *global = buf;
+	struct mars_brick *brick;
+	int status;
+
+	if (!global->global_power.button || !dent->d_parent || !dent->new_link) {
+		return 0;
+	}
+	status = _parse_args(dent, dent->new_link, 2);
+	if (status < 0) {
+		goto done;
+	}
+	brick = mars_find_brick(global, &client_brick_type, dent->d_argv[0]);
+	MARS_DBG("search for '%s' -> found %p\n", dent->d_argv[0], brick);
+	if (!brick) {
+		brick = make_all(global,
+				 &client_brick_type,
+				 dent->d_argv[0],
+				 dent->d_argv[0],
+				 dent->d_parent,
+				 (const void *[]){},
+				 (const char *[]){},
+				 (const char *[]){},
+				 0,
+				 true);
+		status = -1;
+		if (!brick)
+			goto done;
+	}
+	brick = mars_find_brick(global, &if_device_brick_type, dent->d_path);
+	MARS_DBG("search for '%s' -> found %p\n", dent->d_path, brick);
+	if (!brick) {
+		brick = make_all(global,
+				 &if_device_brick_type,
+				 dent->d_path,
+				 dent->d_argv[1],
+				 dent->d_parent,
+				 (const void *[]){&client_brick_type},
+				 (const char *[]){dent->d_argv[0]},
+				 (const char *[]){""},
+				 1,
+				 true);
+		status = -1;
+		if (!brick)
+			goto done;
+	}
+	status = 0;
+done:
+	MARS_DBG("status = %d\n", status);
+	return status;
+}
+
+/* Create (when necessary) and start a copy brick named path.
+ * argv[0] and argv[1] are the two copy targets. A target starting
+ * with '/' is a local AIO brick, anything else a remote one, for which
+ * two client bricks are instantiated (data IO and background copy IO).
+ * The copy area is taken from the current sizes: it starts at the size
+ * of target 1 and ends at the size of target 0.
+ */
+static
+int __make_copy(struct mars_global *global, struct light_dent *parent, char *path, char *argv[])
+{
+	/* One name buffer per target: new_argv[] keeps pointers into
+	 * them until the copy brick has been created.
+	 */
+	char tmp[2][128];
+	char *new_argv[4];
+	struct mars_brick *copy;
+	struct copy_brick *_copy;
+	struct mars_output *output[2] = {};
+	struct mars_info info[2] = {};
+	int i;
+	int status = -1;
+
+	for (i = 0; i < 2; i++) {
+		char *target = argv[i];
+		struct mars_brick *new = NULL;
+
+		new_argv[i * 2] = target;
+		new_argv[i * 2 + 1] = target;
+		if (*target == '/') { // local
+			new = mars_find_brick(global, &device_aio_brick_type, target);
+			MARS_DBG("search for local '%s' -> found %p\n", target, new);
+		} else { // remote
+			new = mars_find_brick(global, &client_brick_type, target);
+			MARS_DBG("search for remote '%s' -> found %p\n", target, new);
+			if (!new) {
+				snprintf(tmp[i], sizeof(tmp[i]), "%s_copy", target);
+				new_argv[i * 2 + 1] = tmp[i];
+				/* 1st client instance is for data IO
+				 */
+				new = make_all(global,
+					       &client_brick_type,
+					       target,
+					       target,
+					       parent,
+					       (const void *[]){},
+					       (const char *[]){},
+					       (const char *[]){},
+					       0,
+					       true);
+				if (!new) {
+					MARS_DBG("cannot instantiate\n");
+					goto done;
+				}
+				/* 2nd client instance is for background copy IO
+				 */
+				new = make_all(global,
+					       &client_brick_type,
+					       tmp[i],
+					       target,
+					       parent,
+					       (const void *[]){},
+					       (const char *[]){},
+					       (const char *[]){},
+					       0,
+					       true);
+			}
+		}
+		if (!new) {
+			MARS_DBG("cannot instantiate\n");
+			goto done;
+		}
+		output[i] = new->outputs[0];
+	}
+
+	copy = mars_find_brick(global, &copy_brick_type, path);
+	MARS_DBG("search for copy brick '%s' -> found %p\n", path, copy);
+	if (!copy) {
+		copy = make_all(global,
+				&copy_brick_type,
+				path,
+				path,
+				parent,
+				(const void *[]){NULL,NULL,NULL,NULL},
+				(const char *[]){new_argv[0],new_argv[1],new_argv[2],new_argv[3]},
+				(const char *[]){"","","",""},
+				4, false);
+		MARS_DBG("copy brick = %p\n", copy);
+		if (!copy)
+			goto done;
+
+	}
+
+	/* Determine the copy area
+	 */
+	for (i = 0; i < 2; i++) {
+		status = output[i]->ops->mars_get_info(output[i], &info[i]);
+		if (status < 0) {
+			MARS_ERR("cannot determine current size of '%s'\n", argv[i]);
+			goto done;
+		}
+	}
+	_copy = (void*)copy;
+	_copy->copy_start = info[1].current_size;
+	MARS_DBG("copy_start = %lld\n", _copy->copy_start);
+	_copy->copy_end = info[0].current_size;
+	MARS_DBG("copy_end = %lld\n", _copy->copy_end);
+	mars_power_button((void*)copy, true);
+	status = copy->ops->brick_switch(copy);
+	MARS_DBG("copy switch status = %d\n", status);
+	// the result of switching is only logged, not reported
+	status = 0;
+
+done:
+	MARS_DBG("status = %d\n", status);
+	return status;
+}
+
+/* Handler for "_copy-" symlinks with the value "<target0>,<target1>".
+ */
+static int _make_copy(void *buf, struct light_dent *dent)
+{
+	struct mars_global *global = buf;
+	int status;
+
+	if (!global->global_power.button || !dent->d_parent || !dent->new_link) {
+		return 0;
+	}
+	status = _parse_args(dent, dent->new_link, 2);
+	if (status < 0) {
+		goto done;
+	}
+
+	status = __make_copy(global, dent->d_parent, dent->d_path, dent->d_argv);
+
+done:
+	MARS_DBG("status = %d\n", status);
+	return status;
+}
+
+///////////////////////////////////////////////////////////////////////
+
+// the order is important!
+enum {
+	CL_ROOT, // root element: this must have index 0
+	CL_IPS,
+	CL_PEERS,
+	CL_RESOURCE,
+	CL_DATA,
+	CL_PRIMARY,
+	CL__FILE,
+	CL_SYNC,
+	CL__COPY,
+	CL__REMOTE,
+	CL__DIRECT,
+	CL_REPLAYSTATUS,
+	CL_LOG,
+	CL_DEVICE,
+	CL_CONNECT,
+};
+
+/* Please keep the order the same as in the enum.
+ */
+static const struct light_class light_classes[] = {
+	/* Placeholder for root node /mars/
+	 */
+	[CL_ROOT] = {
+	},
+
+	/* Directory containing the addresses of all peers
+	 */
+	[CL_IPS] = {
+		.cl_name = "ips",
+		.cl_len = 3,
+		.cl_type = 'd',
+		.cl_father = CL_ROOT,
+#if 1
+		.cl_forward = make_scan,
+		.cl_backward = kill_scan,
+#endif
+	},
+	/* Anyone participating in a MARS cluster must
+	 * be named here (symlink pointing to the IP address).
+	 * We have no DNS in kernel space.
+	 */
+	[CL_PEERS] = {
+		.cl_name = "ip-",
+		.cl_len = 3,
+		.cl_type = 'l',
+		.cl_father = CL_IPS,
+		.cl_forward = NULL,
+		.cl_backward = NULL,
+	},
+
+	/* Directory containing all items of a resource
+	 */
+	[CL_RESOURCE] = {
+		.cl_name = "resource-",
+		.cl_len = 9,
+		.cl_type = 'd',
+		.cl_father = CL_ROOT,
+		.cl_forward = make_log_init,
+		.cl_backward = NULL,
+	},
+	/* File or symlink to the real device / real (sparse) file
+	 * when hostcontext is missing, the corresponding peer will
+	 * not participate in that resource.
+	 */
+	[CL_DATA] = {
+		.cl_name = "data-",
+		.cl_len = 5,
+		.cl_type = 'F',
+		.cl_hostcontext = true,
+		.cl_father = CL_RESOURCE,
+		.cl_forward = make_aio,
+		.cl_backward = kill_default,
+	},
+	/* Symlink pointing to the name of the primary node
+	 */
+	[CL_PRIMARY] = {
+		.cl_name = "primary",
+		.cl_len = 7,
+		.cl_type = 'l',
+		.cl_hostcontext = false,
+		.cl_father = CL_RESOURCE,
+		.cl_forward = make_primary,
+		.cl_backward = NULL,
+	},
+	/* Only for testing: open local file
+	 */
+	[CL__FILE] = {
+		.cl_name = "_file-",
+		.cl_len = 6,
+		.cl_type = 'F',
+		.cl_serial = true,
+		.cl_hostcontext = true,
+		.cl_father = CL_RESOURCE,
+		.cl_forward = make_aio,
+		.cl_backward = kill_default,
+	},
+	/* symlink indicating the current status / end
+	 * of initial data sync.
+	 */
+	[CL_SYNC] = {
+		.cl_name = "syncstatus-",
+		.cl_len = 11,
+		.cl_type = 'l',
+		.cl_hostcontext = true,
+		.cl_father = CL_RESOURCE,
+#if 0
+		.cl_forward = make_sync,
+		.cl_backward = kill_sync,
+#endif
+	},
+	/* Only for testing: make a copy instance
+	 */
+	[CL__COPY] = {
+		.cl_name = "_copy-",
+		.cl_len = 6,
+		.cl_type = 'l',
+		.cl_serial = true,
+		.cl_hostcontext = true,
+		.cl_father = CL_RESOURCE,
+		.cl_forward = _make_copy,
+		.cl_backward = kill_all,
+	},
+	/* Only for testing: access remote data directly
+	 */
+	[CL__REMOTE] = {
+		.cl_name = "_remote-",
+		.cl_len = 8,
+		.cl_type = 'l',
+		.cl_serial = true,
+		.cl_hostcontext = true,
+		.cl_father = CL_RESOURCE,
+		.cl_forward = _make_remote,
+		.cl_backward = _kill_remote,
+	},
+	/* Only for testing: access local data
+	 */
+	[CL__DIRECT] = {
+		.cl_name = "_direct-",
+		.cl_len = 8,
+		.cl_type = 'l',
+		.cl_serial = true,
+		.cl_hostcontext = true,
+		.cl_father = CL_RESOURCE,
+		.cl_forward = _make_direct,
+		.cl_backward = kill_all,
+	},
+
+	/* Passive symlink indicating the last state of
+	 * transaction log replay.
+	 */
+	[CL_REPLAYSTATUS] = {
+		.cl_name = "replay-",
+		.cl_len = 7,
+		.cl_type = 'l',
+		.cl_hostcontext = true,
+		.cl_father = CL_RESOURCE,
+		.cl_forward = NULL,
+		.cl_backward = NULL,
+	},
+	/* Logfiles for transaction logger
+	 */
+	[CL_LOG] = {
+		.cl_name = "log-",
+		.cl_len = 4,
+		.cl_type = 'F',
+		.cl_serial = true,
+		.cl_hostcontext = true,
+		.cl_father = CL_RESOURCE,
+#if 1
+		.cl_forward = make_log,
+		.cl_backward = kill_all,
+#endif
+	},
+
+	/* Name of the device appearing at the primary
+	 */
+	[CL_DEVICE] = {
+		.cl_name = "device-",
+		.cl_len = 7,
+		.cl_type = 'l',
+		.cl_hostcontext = true,
+		.cl_father = CL_RESOURCE,
+		.cl_forward = make_dev,
+		.cl_backward = kill_default,
+	},
+	/* Symlink indicating the current peer
+	 */
+	[CL_CONNECT] = {
+		.cl_name = "connect-",
+		.cl_len = 8,
+		.cl_type = 'l',
+		.cl_hostcontext = true,
+		.cl_father = CL_RESOURCE,
+#if 0
+		.cl_forward = make_connect,
+		.cl_backward = kill_connect,
+#endif
+	},
+	{}
+};
+
+/* Helper routine to pre-determine the relevance of a name from the filesystem.
+ */
+static int light_checker(const char *path, const char *_name, int namlen, unsigned int d_type, int *prefix, int *serial)
+{
+	/* Return value:
+	 *   >= 1  index into light_classes[] of the first class whose
+	 *         cl_name is a prefix of the name (and whose serial /
+	 *         host context requirements are met)
+	 *   -1    prefix matched, but the serial number is missing or the
+	 *         host context does not belong to this host
+	 *   -2    no class matched at all
+	 * On a match, *prefix receives the number of leading characters
+	 * consumed (class name, optional serial number and one optional '-'),
+	 * and *serial receives the parsed number for cl_serial classes.
+	 * path and d_type are not evaluated here.
+	 */
+	char name[namlen+1];
+	int class;
+	// work on a NUL-terminated private copy, _name need not be terminated
+	memcpy(name, _name, namlen);
+	name[namlen] = '\0';
+
+	//MARS_DBG("trying '%s' '%s'\n", path, name);
+	// the table is terminated by an entry with empty cl_len / cl_name
+	for (class = CL_ROOT + 1; ; class++) {
+		const struct light_class *test = &light_classes[class];
+		int len = test->cl_len;
+		if (!len || !test->cl_name)
+			break;
+		//MARS_DBG(" testing class '%s'\n", test->cl_name);
+#if 0
+		if (len != strlen(test->cl_name)) {
+			MARS_ERR("internal table '%s': %d != %d\n", test->cl_name, len, (int)strlen(test->cl_name));
+			len = strlen(test->cl_name);
+		}
+#endif
+		if (namlen >= len && !memcmp(name, test->cl_name, len)) {
+			//MARS_DBG("path '%s/%s' matches class %d '%s'\n", path, name, class, test->cl_name);
+			// special contexts
+			if (test->cl_serial) {
+				int plus = 0;
+				int count;
+				// %n yields the number of characters consumed by the number
+				count = sscanf(name+len, "%d%n", serial, &plus);
+				if (count < 1) {
+					//MARS_DBG("'%s' serial number mismatch at '%s'\n", name, name+len);
+					return -1;
+				}
+				len += plus;
+				if (name[len] == '-')
+					len++;
+			}
+			*prefix = len;
+			if (test->cl_hostcontext) {
+				/* The rest of the name must be exactly our own id.
+				 * A length-limited memcmp() over the remaining
+				 * namlen-len bytes would also accept an empty rest
+				 * or any proper prefix of our id, and could read
+				 * beyond the end of the id string.
+				 */
+				if (strcmp(name+len, my_id())) {
+					//MARS_DBG("context mismatch '%s' at '%s'\n", name, name+len);
+					return -1;
+				}
+			}
+			return class;
+		}
+	}
+	//MARS_DBG("no match for '%s' '%s'\n", path, name);
+	return -2;
+}
+
+/* Do some syntactic checks, then delegate work to the real worker functions
+ * from the light_classes[] table.
+ */
+static int light_worker(struct mars_global *global, struct mars_dent *dent, bool direction)
+{
+	/* direction == true selects the cl_backward handler of the class,
+	 * direction == false the cl_forward handler.
+	 * Returns -EINVAL when a check fails, 0 when the class has no
+	 * handler for the direction, otherwise the handler's status.
+	 */
+	light_worker_fn worker;
+	int class = dent->d_class;
+
+	// class is used as an index into light_classes[] below
+	if (class < 0 || class >= sizeof(light_classes)/sizeof(struct light_class)) {
+		MARS_ERR_ONCE(dent, "bad internal class %d of '%s'\n", class, dent->d_path);
+		return -EINVAL;
+	}
+	// the inode type must agree with cl_type: 'd'ir, 'f'ile, 'F'ile-or-symlink, 'l'ink
+	switch (light_classes[class].cl_type) {
+	case 'd':
+		if (!S_ISDIR(dent->new_stat.mode)) {
+			MARS_ERR_ONCE(dent, "'%s' should be a directory, but is something else\n", dent->d_path);
+			return -EINVAL;
+		}
+		break;
+	case 'f':
+		if (!S_ISREG(dent->new_stat.mode)) {
+			MARS_ERR_ONCE(dent, "'%s' should be a regular file, but is something else\n", dent->d_path);
+			return -EINVAL;
+		}
+		break;
+	case 'F':
+		if (!S_ISREG(dent->new_stat.mode) && !S_ISLNK(dent->new_stat.mode)) {
+			MARS_ERR_ONCE(dent, "'%s' should be a regular file or a symlink, but is something else\n", dent->d_path);
+			return -EINVAL;
+		}
+		break;
+	case 'l':
+		if (!S_ISLNK(dent->new_stat.mode)) {
+			MARS_ERR_ONCE(dent, "'%s' should be a symlink, but is something else\n", dent->d_path);
+			return -EINVAL;
+		}
+		break;
+	}
+	// the position in the directory hierarchy must agree with cl_father
+	if (likely(class > CL_ROOT)) {
+		int father = light_classes[class].cl_father;
+		if (father == CL_ROOT) {
+			if (unlikely(dent->d_parent)) {
+				MARS_ERR_ONCE(dent, "'%s' is not at the root of the hierarchy\n", dent->d_path);
+				return -EINVAL;
+			}
+		} else if (unlikely(!dent->d_parent || dent->d_parent->d_class != father)) {
+			// NOTE(review): the value printed as "class" is the expected father class, not 'class' itself
+			MARS_ERR_ONCE(dent, "last component '%s' from '%s' is at the wrong position in the hierarchy (class = %d, parent_class = %d, parent = '%s')\n", dent->d_name, dent->d_path, father, dent->d_parent ? dent->d_parent->d_class : -9999, dent->d_parent ? dent->d_parent->d_path : "");
+			return -EINVAL;
+		}
+	}
+	if (direction) {
+		worker = light_classes[class].cl_backward;
+	} else {
+		worker = light_classes[class].cl_forward;
+	}
+	if (worker) {
+		int status;
+		//MARS_DBG("working %s on '%s' rest='%s'\n", direction ? "backward" : "forward", dent->d_path, dent->d_rest);
+		status = worker(global, (void*)dent);
+		MARS_DBG("worked %s on '%s', status = %d\n", direction ? "backward" : "forward", dent->d_path, status);
+		return status;
+	}
+	return 0;
+}
+
+/* Main kernel thread: repeatedly scans /mars via mars_dent_work(), using
+ * light_checker / light_worker as callbacks, until the global power button
+ * is off and the brick list is empty.
+ * Returns 0, or -EFAULT when my_id() yields no usable host name.
+ */
+static int light_thread(void *data)
+{
+	char *id = my_id();
+	int status = 0;
+	struct mars_global global = {
+		.dent_anchor = LIST_HEAD_INIT(global.dent_anchor),
+		.brick_anchor = LIST_HEAD_INIT(global.brick_anchor),
+		.global_power = {
+			.button = true,
+		},
+		.mutex = __SEMAPHORE_INITIALIZER(global.mutex, 1),
+		.main_event = __WAIT_QUEUE_HEAD_INITIALIZER(global.main_event),
+	};
+	mars_global = &global; // TODO: cleanup, avoid stack
+
+	if (!id || strlen(id) < 2) {
+		MARS_ERR("invalid hostname\n");
+		status = -EFAULT;
+		// NOTE(review): this path reaches cleanup_mm() below without a prior fake_mm() - confirm that is harmless
+		goto done;
+	}
+
+	fake_mm();
+
+	MARS_INF("-------- starting as host '%s' ----------\n", id);
+
+	while (global.global_power.button || !list_empty(&global.brick_anchor)) {
+		// shadows the outer status: per-round results do not reach the thread's return value
+		int status;
+		// a stop request switches the button off; the loop goes on until all bricks are gone
+		global.global_power.button = !kthread_should_stop();
+
+		status = mars_dent_work(&global, "/mars", sizeof(struct light_dent), light_checker, light_worker, &global, 3);
+		MARS_DBG("worker status = %d\n", status);
+
+		// sleep until main_trigger is set, but for at most 30 seconds
+		wait_event_interruptible_timeout(global.main_event, global.main_trigger, 30 * HZ);
+		global.main_trigger = false;
+
+#ifdef MARS_DEBUGGING
+		{
+			struct list_head *tmp;
+			int dent_count = 0;
+			int brick_count = 0;
+
+			down(&global.mutex);
+			MARS_IO("----------- lists:\n");
+			for (tmp = global.dent_anchor.next; tmp != &global.dent_anchor; tmp = tmp->next) {
+				struct mars_dent *dent;
+				dent = container_of(tmp, struct mars_dent, sub_link);
+				MARS_IO("dent '%s'\n", dent->d_path);
+				dent_count++;
+			}
+			for (tmp = global.brick_anchor.next; tmp != &global.brick_anchor; tmp = tmp->next) {
+				struct mars_brick *test;
+				test = container_of(tmp, struct mars_brick, brick_link);
+				MARS_IO("brick path = '%s' name = '%s' button = %d on = %d off = %d\n", test->brick_path, test->brick_name, test->power.button, test->power.led_on, test->power.led_off);
+				brick_count++;
+			}
+			up(&global.mutex);
+
+			MARS_INF("----------- STATISTICS: %d dents, %d bricks\n", dent_count, brick_count);
+		}
+		msleep(500);
+#endif
+	}
+
+done:
+	MARS_INF("-------- cleaning up ----------\n");
+
+	mars_dent_free_all(&global.dent_anchor);
+
+	cleanup_mm();
+
+	// 'global' lives on this stack frame, so the published pointer must be withdrawn before returning
+	mars_global = NULL;
+	main_thread = NULL;
+
+	MARS_INF("-------- done status = %d ----------\n", status);
+	return status;
+}
+
+/* Module exit: stop the main thread (if it is still registered in
+ * main_thread) and drop the task reference taken in init_light().
+ */
+static void __exit exit_light(void)
+{
+	// TODO: make this thread-safe.
+	struct task_struct *thread = main_thread;
+	if (thread) {
+		main_thread = NULL;
+		MARS_DBG("====================== stopping everything...\n");
+		kthread_stop_nowait(thread);
+		// wake the main loop out of its up-to-30s wait
+		mars_trigger();
+		kthread_stop(thread);
+		put_task_struct(thread);
+		MARS_DBG("====================== stopped everything.\n");
+	}
+}
+
+/* Module init: create and start the main thread, holding an extra task
+ * reference for exit_light(). Returns 0 or the kthread_create() error.
+ */
+static int __init init_light(void)
+{
+	struct task_struct *thread;
+	thread = kthread_create(light_thread, NULL, "mars_light");
+	if (IS_ERR(thread)) {
+		return PTR_ERR(thread);
+	}
+	get_task_struct(thread);
+	main_thread = thread;
+	wake_up_process(thread);
+	return 0;
+}
+
+MODULE_DESCRIPTION("MARS Light");
+MODULE_AUTHOR("Thomas Schoebel-Theuer ");
+MODULE_LICENSE("GPL");
+
+module_init(init_light);
+module_exit(exit_light);
diff --git a/mars_net.c b/mars_net.c
new file mode 100644
index 00000000..d86d7cb9
--- /dev/null
+++ b/mars_net.c
@@ -0,0 +1,684 @@
+// (c) 2011 Thomas Schoebel-Theuer / 1&1 Internet AG
+
+//#define BRICK_DEBUGGING
+//#define MARS_DEBUGGING
+//#define IO_DEBUGGING
+
+// MARS_IO() compiles to nothing unless IO_DEBUGGING is defined
+#ifdef IO_DEBUGGING
+#define MARS_IO MARS_DBG
+#else
+#define MARS_IO(args...) /*empty*/
+#endif
+
+#include
+#include
+#include
+
+#define _STRATEGY
+#include "mars.h"
+#include "mars_net.h"
+
+/* Low-level network traffic
+ */
+
+/* TODO: allow binding to specific source addresses instead of catch-all.
+ * TODO: make all the socket options configurable.
+ * TODO: implement signal handling.
+ * TODO: add authentication.
+ * TODO: add compression / encryption.
+ */
+
+struct mars_tcp_params default_tcp_params = {
+	.window_size = 8 * 1024 * 1024, // for long distance replications
+	.tcp_timeout = 20,
+	.tcp_keepcnt = 6,
+	.tcp_keepintvl = 10, // keepalive ping time
+	.tcp_keepidle = 10,
+	.tos = IPTOS_LOWDELAY,
+};
+EXPORT_SYMBOL(default_tcp_params);
+
+/* Report a failed setsockopt; the failure is otherwise ignored.
+ */
+static void _check(int status)
+{
+	if (status < 0) {
+		MARS_ERR("cannot set socket option, status = %d\n", status);
+	}
+}
+
+/* Fill *addr with an IPv4 address parsed from spec.
+ * Accepted forms: "" (defaults only), "a.b.c.d", "a.b.c.d:port", ":port".
+ * The port defaults to MARS_DEFAULT_PORT, the address to all-zero.
+ * Returns 0 on success, -EINVAL on a syntax error.
+ */
+int mars_create_sockaddr(struct sockaddr_storage *addr, const char *spec)
+{
+	struct sockaddr_in *sockaddr = (void*)addr;
+	int status;
+	// memset() takes (dest, value, size); with the last two swapped nothing was cleared
+	memset(addr, 0, sizeof(*addr));
+	sockaddr->sin_family = AF_INET;
+	sockaddr->sin_port = htons(MARS_DEFAULT_PORT);
+	/* This is PROVISIONARY!
+	 * TODO: add IPV6 syntax and many more features :)
+	 */
+	if (!*spec)
+		return 0;
+	if (*spec != ':') {
+		unsigned char u0 = 0, u1 = 0, u2 = 0, u3 = 0;
+		status = sscanf(spec, "%hhu.%hhu.%hhu.%hhu", &u0, &u1, &u2, &u3);
+		if (status != 4)
+			return -EINVAL;
+		// build the address in host order and convert, so that it is correct on any byte order
+		sockaddr->sin_addr.s_addr = htonl((u32)u0 << 24 | (u32)u1 << 16 | (u32)u2 << 8 | (u32)u3);
+	}
+	// skip to just behind the first ':' (or to the end of the string)
+	while (*spec && *spec++ != ':')
+		/*empty*/;
+	if (*spec) {
+		int port = 0;
+		status = sscanf(spec, "%d", &port);
+		if (status != 1)
+			return -EINVAL;
+		sockaddr->sin_port = htons(port);
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mars_create_sockaddr);
+
+/* Create a TCP socket, apply default_tcp_params and bind it.
+ * A server socket is bound to *addr; a client socket is bound to an
+ * all-zero address and then connected to *addr.
+ * Returns the status of the last operation (negative on error).
+ * NOTE(review): when bind or connect fails, *sock stays allocated -
+ * confirm that all callers release it.
+ */
+int mars_create_socket(struct socket **sock, struct sockaddr_storage *addr, bool is_server)
+{
+	struct sockaddr null_bind = {};
+	struct sockaddr *sockaddr = (void*)addr;
+	int x_true = 1;
+	int status;
+
+	if (!is_server) {
+		sockaddr = &null_bind;
+	}
+
+	status = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, sock);
+	if (status < 0) {
+		*sock = NULL;
+		MARS_ERR("cannot create socket, status = %d\n", status);
+		return status;
+	}
+
+	/* TODO: improve this by a table-driven approach
+	 */
+	(*sock)->sk->sk_rcvtimeo = (*sock)->sk->sk_sndtimeo = default_tcp_params.tcp_timeout * HZ;
+	status = kernel_setsockopt(*sock, SOL_SOCKET, SO_SNDBUF, (char*)&default_tcp_params.window_size, sizeof(default_tcp_params.window_size));
+	_check(status);
+	status = kernel_setsockopt(*sock, SOL_SOCKET, SO_RCVBUF, (char*)&default_tcp_params.window_size, sizeof(default_tcp_params.window_size));
+	_check(status);
+	// the type-of-service value belongs to IP_TOS; SO_PRIORITY is a SOL_SOCKET option and names something else at level SOL_IP
+	status = kernel_setsockopt(*sock, SOL_IP, IP_TOS, (char*)&default_tcp_params.tos, sizeof(default_tcp_params.tos));
+	_check(status);
+	status = kernel_setsockopt(*sock, IPPROTO_TCP, TCP_NODELAY, (char*)&x_true, sizeof(x_true));
+	_check(status);
+	status = kernel_setsockopt(*sock, SOL_SOCKET, SO_KEEPALIVE, (char*)&x_true, sizeof(x_true));
+	_check(status);
+	status = kernel_setsockopt(*sock, IPPROTO_TCP, TCP_KEEPCNT, (char*)&default_tcp_params.tcp_keepcnt, sizeof(default_tcp_params.tcp_keepcnt));
+	_check(status);
+	status = kernel_setsockopt(*sock, IPPROTO_TCP, TCP_KEEPINTVL, (char*)&default_tcp_params.tcp_keepintvl, sizeof(default_tcp_params.tcp_keepintvl));
+	_check(status);
+	status = kernel_setsockopt(*sock, IPPROTO_TCP, TCP_KEEPIDLE, (char*)&default_tcp_params.tcp_keepidle, sizeof(default_tcp_params.tcp_keepidle));
+	_check(status);
+
+	status = kernel_bind(*sock, sockaddr, sizeof(*sockaddr));
+	if (status < 0) {
+		MARS_ERR("bind failed, status = %d\n", status);
+		return status;
+	}
+
+	if (!is_server) {
+		sockaddr = (void*)addr;
+		status = kernel_connect(*sock, sockaddr, sizeof(*sockaddr), 0);
+		if (status < 0) {
+			MARS_ERR("connect failed, status = %d\n", status);
+		}
+	}
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(mars_create_socket);
+
+/* Send exactly len bytes from buf, retrying after -EAGAIN / -EINTR.
+ * *sock is re-checked on every round; -EIDRM is returned once it is NULL.
+ * Returns len on success, otherwise a negative error code
+ * (-ECOMM when the socket accepts no more data).
+ */
+int mars_send(struct socket **sock, void *buf, int len)
+{
+	struct kvec iov = {
+		.iov_base = buf,
+		.iov_len = len,
+	};
+	struct msghdr msg = {
+		.msg_iov = (struct iovec*)&iov,
+		.msg_flags = 0 /* | MSG_NOSIGNAL*/,
+	};
+	int status = -EIDRM;
+	int sent = 0;
+
+	//MARS_IO("buf = %p, len = %d\n", buf, len);
+	while (sent < len) {
+		mm_segment_t oldfs;
+
+		if (unlikely(!*sock)) {
+			MARS_ERR("socket has disappeared\n");
+			status = -EIDRM;
+			goto done;
+		}
+
+		oldfs = get_fs();
+		set_fs(get_ds());
+		// after a partial send only the remainder is described by iov, so pass the remaining size
+		status = kernel_sendmsg(*sock, &msg, &iov, 1, len - sent);
+		set_fs(oldfs);
+
+		if (status == -EAGAIN) {
+			msleep(50);
+			continue;
+		}
+
+		if (status == -EINTR) { // ignore it
+			flush_signals(current);
+			msleep(50);
+			continue;
+		}
+
+		if (status < 0) {
+			MARS_ERR("bad socket sendmsg, len=%d, iov_len=%d, sent=%d, status = %d\n", len, (int)iov.iov_len, sent, status);
+			goto done;
+		}
+
+		if (!status) {
+			MARS_ERR("EOF from socket upon sendmsg\n");
+			status = -ECOMM;
+			goto done;
+		}
+
+		// advance behind the part that has been transmitted
+		iov.iov_base += status;
+		iov.iov_len -= status;
+		sent += status;
+	}
+	status = sent;
+done:
+	return status;
+}
+EXPORT_SYMBOL_GPL(mars_send);
+
+/* Receive at least minlen and at most maxlen bytes into buf.
+ * -EAGAIN from the socket is retried after a short sleep.
+ * Returns the number of bytes received, or a negative error code
+ * (-EPIPE on EOF, -EIDRM when *sock is NULL, -EINVAL when buf is NULL).
+ */
+int mars_recv(struct socket **sock, void *buf, int minlen, int maxlen)
+{
+	int status = -EIDRM;
+	int done = 0;
+
+	if (!buf) {
+		MARS_ERR("bad receive buffer\n");
+		return -EINVAL;
+	}
+
+	while (done < minlen) {
+		mm_segment_t oldfs;
+		// describe the still unfilled rest of the buffer
+		struct kvec iov = {
+			.iov_base = buf + done,
+			.iov_len = maxlen - done,
+		};
+		struct msghdr msg = {
+			.msg_iovlen = 1,
+			.msg_iov = (struct iovec*)&iov,
+			.msg_flags = 0 | MSG_WAITALL /*| MSG_NOSIGNAL*/,
+		};
+
+		if (unlikely(!*sock)) {
+			MARS_ERR("socket has disappeared\n");
+			status = -EIDRM;
+			goto err;
+		}
+
+		MARS_IO("done %d, fetching %d bytes\n", done, maxlen-done);
+
+		oldfs = get_fs();
+		set_fs(get_ds());
+		status = kernel_recvmsg(*sock, &msg, &iov, 1, maxlen-done, msg.msg_flags);
+		set_fs(oldfs);
+
+		if (status == -EAGAIN) {
+#if 0
+			if (!done)
+				goto err;
+#endif
+			msleep(50);
+			continue;
+		}
+		if (!status) { // EOF
+			MARS_ERR("got EOF (done=%d, req_size=%d)\n", done, maxlen-done);
+			status = -EPIPE;
+			goto err;
+		}
+		if (status < 0) {
+			MARS_ERR("bad recvmsg, status = %d\n", status);
+			goto err;
+		}
+		done += status;
+	}
+	status = done;
+
+err:
+	return status;
+}
+EXPORT_SYMBOL_GPL(mars_recv);
+
+///////////////////////////////////////////////////////////////////////
+
+/* Mid-level field data exchange
+ */
+
+/* TODO: make this bytesex-aware
+ */
+#define MARS_NET_MAGIC 0x63f092ec6048f48cll
+
+/* Precedes every transmitted field.
+ * h_seq is (u16)-1 in the EOF marker; an empty h_name marks the end of
+ * a record; h_len is the number of payload bytes following the header.
+ */
+struct mars_net_header {
+	u64 h_magic;
+	char h_name[MAX_FIELD_LEN];
+	u16 h_seq;
+	u16 h_len;
+};
+
+/* Send the structure at data, field by field as described by the meta
+ * table (terminated by an entry with empty field_name). Nested
+ * structures (FIELD_SUB / FIELD_REF) are sent recursively.
+ * data == NULL sends a single EOF header instead.
+ * *seq is incremented for every header sent.
+ * Returns the number of payload items sent, or a negative error code;
+ * for data == NULL the result of mars_send() is returned.
+ */
+int _mars_send_struct(struct socket **sock, void *data, const struct meta *meta, int *seq)
+{
+	int count = 0;
+	int status = 0;
+	if (!data) { // send EOF
+		struct mars_net_header header = {
+			.h_magic = MARS_NET_MAGIC,
+			.h_seq = -1,
+		};
+		return mars_send(sock, &header, sizeof(header));
+	}
+	for (; ; meta++) {
+		struct mars_net_header header = {
+			.h_magic = MARS_NET_MAGIC,
+			.h_seq = ++(*seq),
+		};
+		void *item = data + meta->field_offset;
+		int len = meta->field_size;
+#if 1
+		if (len > 16 * PAGE_SIZE) {
+			MARS_ERR("implausible len=%d, \n", len);
+			msleep(30000);
+			status = -EINVAL;
+			break;
+		}
+#endif
+
+		/* Automatically keep the lamport clock correct.
+		 */
+		if (meta == mars_cmd_meta) {
+			struct timespec *stamp = &((struct mars_cmd*)data)->cmd_stamp;
+			get_lamport(stamp);
+		} else if (meta == mars_timespec_meta) {
+			set_lamport(data);
+		}
+
+		status = 0;
+		// determine the payload pointer and length for this field type
+		switch (meta->field_type) {
+		case FIELD_STRING:
+			item = *(void**)item;
+			len = 0;
+			if (item)
+				len = strlen(item);
+			break;
+		case FIELD_REF:
+			if (!meta->field_ref) {
+				MARS_ERR("improper FIELD_REF definition\n");
+				status = -EINVAL;
+				break;
+			}
+			item = *(void**)item;
+			len = meta->field_ref->field_size;
+			if (!item)
+				len = 0;
+			break;
+		case FIELD_DONE:
+			len = 0;
+			// fall through: nothing else to prepare
+		case FIELD_SUB:
+		case FIELD_RAW:
+		case FIELD_INT:
+		case FIELD_UINT:
+			// all ok
+			break;
+		default:
+			MARS_ERR("invalid field type %d\n", meta->field_type);
+			status = -EINVAL;
+			break;
+		}
+		if (status < 0)
+			break;
+
+		header.h_len = len;
+		// NOTE(review): a name of MAX_FIELD_LEN or more characters leaves h_name without terminating NUL
+		strncpy(header.h_name, meta->field_name, MAX_FIELD_LEN);
+
+		MARS_IO("sending header %d '%s' len = %d\n", header.h_seq, meta->field_name, len);
+		status = mars_send(sock, &header, sizeof(header));
+		if (status < 0 || !meta->field_name[0]) { // EOR
+			break;
+		}
+
+		switch (meta->field_type) {
+		case FIELD_REF:
+		case FIELD_SUB:
+			status = _mars_send_struct(sock, item, meta->field_ref, seq);
+			if (status > 0)
+				count += status;
+			break;
+		default:
+			if (len > 0) {
+				MARS_IO("sending extra %d\n", len);
+				status = mars_send(sock, item, len);
+				if (status > 0)
+					count++;
+			}
+		}
+
+		if (status < 0) {
+			break;
+		}
+	}
+	if (status >= 0)
+		status = count;
+	return status;
+}
+
+/* Send one structure, starting a fresh header sequence.
+ */
+int mars_send_struct(struct socket **sock, void *data, const struct meta *meta)
+{
+	int seq = 0;
+	return _mars_send_struct(sock, data, meta, &seq);
+}
+EXPORT_SYMBOL_GPL(mars_send_struct);
+
+/* Receive one structure into data, matching the incoming field names
+ * against the meta table. Unknown fields are skipped. Memory for
+ * FIELD_STRING / FIELD_REF members is allocated here with kzalloc() and
+ * stored into the structure; releasing it is up to the caller.
+ * Returns the number of payload items received (0 when the EOF marker
+ * or an empty record arrived), or a negative error code.
+ */
+int _mars_recv_struct(struct socket **sock, void *data, const struct meta *meta, int *seq)
+{
+	int count = 0;
+	int status = -EINVAL;
+
+	//MARS_IO("\n");
+	if (!data) {
+		goto done;
+	}
+	for (;;) {
+		struct mars_net_header header = {};
+		const struct meta *tmp;
+		void *item;
+		void *mem;
+		status = mars_recv(sock, &header, sizeof(header), sizeof(header));
+		if (status == -EAGAIN) {
+			msleep(50);
+			continue;
+		}
+		if (status < 0) {
+			MARS_ERR("status = %d\n", status);
+			break;
+		}
+		MARS_IO("got header %d '%s' len = %d\n", header.h_seq, header.h_name, header.h_len);
+		if (header.h_magic != MARS_NET_MAGIC) {
+			MARS_ERR("bad packet header magic = %llx\n", header.h_magic);
+			status = -ENOMSG;
+			break;
+		}
+		/* h_seq is a u16, so the sender's -1 arrives as 0xffff.
+		 * Comparing against a plain int -1 can never match.
+		 */
+		if (header.h_seq == (u16)-1) { // got EOF
+			status = 0;
+			break;
+		}
+		if (header.h_seq <= *seq) {
+			MARS_ERR("unexpected packet data, seq=%d (expected=%d)\n", header.h_seq, (*seq) + 1);
+			status = -ENOMSG;
+			break;
+		}
+		*seq = header.h_seq;
+
+		if (!header.h_name[0]) { // end of record (EOR)
+			status = 0;
+			break;
+		}
+
+		tmp = find_meta(meta, header.h_name);
+		if (!tmp) {
+			MARS_ERR("unknown field '%s'\n", header.h_name);
+			if (header.h_len > 0) { // try to continue by skipping the rest of data
+				void *dummy = kmalloc(header.h_len, GFP_MARS);
+				status = -ENOMEM;
+				if (!dummy)
+					break;
+				status = mars_recv(sock, dummy, header.h_len, header.h_len);
+				kfree(dummy);
+				if (status < 0)
+					break;
+			}
+			continue;
+		}
+
+		status = 0;
+		item = data + tmp->field_offset;
+		// pointer members get freshly allocated, zeroed memory (one spare byte terminates strings)
+		switch (tmp->field_type) {
+		case FIELD_REF:
+		case FIELD_STRING:
+			if (header.h_len <= 0) {
+				mem = NULL;
+			} else {
+				mem = kzalloc(header.h_len + 1, GFP_MARS);
+				if (!mem) {
+					status = -ENOMEM;
+					goto done;
+				}
+			}
+			*(void**)item = mem;
+			item = mem;
+			break;
+		}
+
+		switch (tmp->field_type) {
+		case FIELD_REF:
+		case FIELD_SUB:
+			if (!item) {
+				MARS_ERR("bad item\n");
+				status = -EINVAL;
+				break;
+			}
+
+			MARS_IO("starting recursive structure\n");
+			status = _mars_recv_struct(sock, item, tmp->field_ref, seq);
+			MARS_IO("ending recursive structure, status = %d\n", status);
+
+			if (status > 0)
+				count += status;
+			break;
+		default:
+			if (header.h_len > 0) {
+				if (!item) {
+					MARS_ERR("bad item\n");
+					status = -EINVAL;
+					break;
+				}
+				MARS_IO("reading extra %d\n", header.h_len);
+				status = mars_recv(sock, item, header.h_len, header.h_len);
+				while (status == -EAGAIN) {
+					msleep(50);
+					status = mars_recv(sock, item, header.h_len, header.h_len);
+				}
+				if (status >= 0) {
+					//MARS_IO("got data len = %d status = %d\n", header.h_len, status);
+					count++;
+				} else {
+					MARS_ERR("len = %d, status = %d\n", header.h_len, status);
+				}
+			}
+		}
+		if (status < 0)
+			break;
+	}
+done:
+	if (status >= 0) {
+		status = count;
+		if (meta == mars_timespec_meta)
+			set_lamport(data);
+	} else {
+		MARS_ERR("status = %d\n", status);
+	}
+	return status;
+}
+
+/* Receive one structure, starting a fresh header sequence.
+ */
+int mars_recv_struct(struct socket **sock, void *data, const struct meta *meta)
+{
+	int seq = 0;
+	return _mars_recv_struct(sock, data, meta, &seq);
+}
+EXPORT_SYMBOL_GPL(mars_recv_struct);
+
+///////////////////////////////////////////////////////////////////////
+
+/* High-level transport of mars structures
+ */
+
+const struct meta mars_cmd_meta[] = {
+	META_INI_SUB(cmd_stamp, struct mars_cmd, mars_timespec_meta),
+	META_INI(cmd_code, struct mars_cmd, FIELD_INT),
+	META_INI(cmd_int1, struct mars_cmd, FIELD_INT),
+	META_INI(cmd_str1, struct mars_cmd, FIELD_STRING),
+	{}
+};
+EXPORT_SYMBOL_GPL(mars_cmd_meta);
+
+/* Transmit every dent on the list, followed by the EOF marker.
+ * Returns the status of the last send (negative on error).
+ */
+int mars_send_dent_list(struct socket **sock, struct list_head *anchor)
+{
+	struct list_head *pos = anchor->next;
+
+	while (pos != anchor) {
+		struct mars_dent *entry = container_of(pos, struct mars_dent, sub_link);
+		int res = mars_send_struct(sock, entry, mars_dent_meta);
+
+		if (res < 0)
+			return res;
+		pos = pos->next;
+	}
+	// a NULL structure makes mars_send_struct() emit the EOF marker
+	return mars_send_struct(sock, NULL, mars_dent_meta);
+}
+EXPORT_SYMBOL_GPL(mars_send_dent_list);
+
+/* Receive dents and append them to the list until a receive yields
+ * nothing (EOF marker) or fails.
+ * Returns 0 at the regular end, a negative error code otherwise.
+ */
+int mars_recv_dent_list(struct socket **sock, struct list_head *anchor)
+{
+	int res;
+
+	do {
+		struct mars_dent *entry = kzalloc(sizeof(struct mars_dent), GFP_MARS);
+
+		if (!entry)
+			return -ENOMEM;
+
+		res = mars_recv_struct(sock, entry, mars_dent_meta);
+		if (res > 0)
+			list_add_tail(&entry->sub_link, anchor);
+		else
+			kfree(entry); // nothing usable arrived for this one
+	} while (res > 0);
+
+	return res;
+}
+EXPORT_SYMBOL_GPL(mars_recv_dent_list);
+
+
+/* Send a CMD_MREF command, the mref description and - when ref_rw is
+ * set - the ref_len data bytes.
+ * Returns the status of the last send (negative on error).
+ */
+int mars_send_mref(struct socket **sock, struct mref_object *mref)
+{
+	struct mars_cmd request = {
+		.cmd_code = CMD_MREF,
+		.cmd_int1 = mref->ref_id,
+	};
+	int res = mars_send_struct(sock, &request, mars_cmd_meta);
+
+	if (res >= 0)
+		res = mars_send_struct(sock, mref, mars_mref_meta);
+	if (res >= 0 && mref->ref_rw)
+		res = mars_send(sock, mref->ref_data, mref->ref_len);
+	return res;
+}
+EXPORT_SYMBOL_GPL(mars_send_mref);
+
+/* Receive the mref description and - when ref_rw is set - the ref_len
+ * data bytes; a data buffer is allocated when the mref has none.
+ * Returns the status of the last receive, or -ENOMEM.
+ */
+int mars_recv_mref(struct socket **sock, struct mref_object *mref)
+{
+	int res = mars_recv_struct(sock, mref, mars_mref_meta);
+
+	if (res < 0 || !mref->ref_rw)
+		return res;
+
+	if (!mref->ref_data) {
+		mref->ref_data = kzalloc(mref->ref_len, GFP_MARS);
+		if (!mref->ref_data)
+			return -ENOMEM;
+	}
+
+	res = mars_recv(sock, mref->ref_data, mref->ref_len, mref->ref_len);
+	if (res < 0)
+		MARS_ERR("mref_len = %d, status = %d\n", mref->ref_len, res);
+	return res;
+}
+EXPORT_SYMBOL_GPL(mars_recv_mref); + +int mars_send_cb(struct socket **sock, struct mref_object *mref) +{ + struct mars_cmd cmd = { + .cmd_code = CMD_CB, + .cmd_int1 = mref->ref_id, + }; + int status; + status = mars_send_struct(sock, &cmd, mars_cmd_meta); + if (status < 0) + goto done; + status = mars_send_struct(sock, mref, mars_mref_meta); + if (status < 0) + goto done; + if (!mref->ref_rw) { + MARS_IO("sending blocklen = %d\n", mref->ref_len); + status = mars_send(sock, mref->ref_data, mref->ref_len); + } +done: + return status; +} +EXPORT_SYMBOL_GPL(mars_send_cb); + +int mars_recv_cb(struct socket **sock, struct mref_object *mref) +{ + int status; + status = mars_recv_struct(sock, mref, mars_mref_meta); + if (status < 0) + goto done; + if (!mref->ref_rw) { + if (!mref->ref_data) { + MARS_ERR("no internal buffer available\n"); + status = -EINVAL; + goto done; + } + MARS_IO("receiving blocklen = %d\n", mref->ref_len); + status = mars_recv(sock, mref->ref_data, mref->ref_len, mref->ref_len); + } +done: + return status; +} +EXPORT_SYMBOL_GPL(mars_recv_cb); + +////////////////// module init stuff ///////////////////////// + +static int __init _init_net(void) +{ + MARS_INF("init_net()\n"); + return 0; +} + +static void __exit _exit_net(void) +{ + MARS_INF("exit_net()\n"); +} + +MODULE_DESCRIPTION("MARS network infrastructure"); +MODULE_AUTHOR("Thomas Schoebel-Theuer "); +MODULE_LICENSE("GPL"); + +module_init(_init_net); +module_exit(_exit_net); diff --git a/mars_net.h b/mars_net.h new file mode 100644 index 00000000..b5af5f9b --- /dev/null +++ b/mars_net.h @@ -0,0 +1,70 @@ +// (c) 2011 Thomas Schoebel-Theuer / 1&1 Internet AG +#ifndef MARS_NET_H +#define MARS_NET_H + +#include +#include +#include + +#include "brick.h" + +#define MARS_DEFAULT_PORT 7777 + +struct mars_tcp_params { + int tcp_timeout; + int window_size; + int tcp_keepcnt; + int tcp_keepintvl; + int tcp_keepidle; + char tos; +}; + +extern struct mars_tcp_params default_tcp_params; + +enum { + CMD_NOP, + 
CMD_STATUS, + CMD_GETINFO, + CMD_GETENTS, + CMD_CONNECT, + CMD_MREF, + CMD_CB, +}; + +struct mars_cmd { + struct timespec cmd_stamp; // for automatic lamport clock + int cmd_code; + int cmd_int1; + //int cmd_int2; + //int cmd_int3; + char *cmd_str1; + //char *cmd_str2; + //char *cmd_str3; +}; + +extern const struct meta mars_cmd_meta[]; + +/* Low-level network traffic + */ +extern int mars_create_sockaddr(struct sockaddr_storage *addr, const char *spec); +extern int mars_create_socket(struct socket **sock, struct sockaddr_storage *addr, bool is_server); +extern int mars_send(struct socket **sock, void *buf, int len); +extern int mars_recv(struct socket **sock, void *buf, int minlen, int maxlen); + +/* Mid-level generic field data exchange + */ +extern int mars_send_struct(struct socket **sock, void *data, const struct meta *meta); +extern int mars_recv_struct(struct socket **sock, void *data, const struct meta *meta); + +/* High-level transport of mars structures + */ +extern int mars_send_dent_list(struct socket **sock, struct list_head *anchor); +extern int mars_recv_dent_list(struct socket **sock, struct list_head *anchor); + +extern int mars_send_mref(struct socket **sock, struct mref_object *mref); +extern int mars_recv_mref(struct socket **sock, struct mref_object *mref); +extern int mars_send_cb(struct socket **sock, struct mref_object *mref); +extern int mars_recv_cb(struct socket **sock, struct mref_object *mref); + + +#endif diff --git a/mars_server.c b/mars_server.c new file mode 100644 index 00000000..26513fd8 --- /dev/null +++ b/mars_server.c @@ -0,0 +1,516 @@ +// (c) 2010 Thomas Schoebel-Theuer / 1&1 Internet AG + +// Server brick (just for demonstration) + +//#define BRICK_DEBUGGING +//#define MARS_DEBUGGING +//#define IO_DEBUGGING + +#ifdef IO_DEBUGGING +#define MARS_IO MARS_DBG +#else +#define MARS_IO(args...) 
/*empty*/ +#endif + +#include +#include +#include +#include + +#define _STRATEGY +#include "mars.h" + +///////////////////////// own type definitions //////////////////////// + +#include "mars_server.h" + +static struct socket *server_socket = NULL; +static struct task_struct *server_thread = NULL; + +///////////////////////// own helper functions //////////////////////// + +static int server_checker(const char *path, const char *name, int namlen, unsigned int d_type, int *prefix, int *serial) +{ + return 0; +} + +static int server_worker(struct mars_global *global, struct mars_dent *dent, bool direction) +{ + return 0; +} + +static void server_endio(struct generic_callback *cb) +{ + struct server_mref_aspect *mref_a; + struct mref_object *mref; + struct server_brick *brick; + struct socket **sock; + int status; + + mref_a = cb->cb_private; + CHECK_PTR(mref_a, err); + mref = mref_a->object; + CHECK_PTR(mref, err); + brick = mref_a->brick; + CHECK_PTR(brick, err); + sock = mref_a->sock; + CHECK_PTR(sock, err); + + down(&brick->socket_sem); + status = mars_send_cb(sock, mref); + up(&brick->socket_sem); + + if (status < 0) { + MARS_ERR("cannot send response, status = %d\n", status); + kernel_sock_shutdown(*sock, SHUT_WR); + } + atomic_dec(&brick->in_flight); + return; +err: + MARS_FAT("cannot handle callback - giving up\n"); +} + +static int handler_thread(void *data) +{ + struct server_brick *brick = data; + struct socket **sock = &brick->handler_socket; + int max_round = 300; + int timeout; + int status = 0; + + brick->handler_thread = NULL; + wake_up_interruptible(&brick->startup_event); + MARS_DBG("--------------- handler_thread starting on socket %p\n", *sock); + if (!*sock) + goto done; + + //fake_mm(); + + while (!kthread_should_stop()) { + struct mars_cmd cmd = {}; + + status = mars_recv_struct(sock, &cmd, mars_cmd_meta); + if (status < 0) { + MARS_ERR("command status = %d\n", status); + break; + } + + MARS_IO("cmd = %d\n", cmd.cmd_code); + + status = -EPROTO; 
+ switch (cmd.cmd_code) { + case CMD_NOP: + MARS_DBG("got NOP operation\n"); + status = 0; + break; + case CMD_STATUS: + //... + MARS_ERR("NYI\n"); + break; + case CMD_GETINFO: + { + struct mars_info info = {}; + status = GENERIC_INPUT_CALL(brick->inputs[0], mars_get_info, &info); + if (status < 0) { + break; + } + status = mars_send_struct(sock, &cmd, mars_cmd_meta); + if (status < 0) { + break; + } + status = mars_send_struct(sock, &info, mars_info_meta); + break; + } + case CMD_GETENTS: + { + struct mars_global glob_tmp = { + .dent_anchor = LIST_HEAD_INIT(glob_tmp.dent_anchor), + .brick_anchor = LIST_HEAD_INIT(glob_tmp.brick_anchor), + .mutex = __SEMAPHORE_INITIALIZER(glob_tmp.mutex, 1), + }; + + status = -EINVAL; + if (!cmd.cmd_str1) + break; + + status = mars_dent_work(&glob_tmp, cmd.cmd_str1, sizeof(struct mars_dent), server_checker, server_worker, NULL, cmd.cmd_int1); + MARS_DBG("dents status = %d\n", status); + if (status < 0) + break; + + down(&brick->socket_sem); + status = mars_send_dent_list(sock, &glob_tmp.dent_anchor); + up(&brick->socket_sem); + + if (status < 0) { + MARS_ERR("could not send dentry information, status = %d\n", status); + } + + mars_dent_free_all(&glob_tmp.dent_anchor); + break; + } + case CMD_CONNECT: + { + struct mars_brick *prev; + + //TODO: fix possible races + prev = mars_find_brick(mars_global, NULL, cmd.cmd_str1); if (likely(prev)) { + status = generic_connect((void*)brick->inputs[0], (void*)prev->outputs[0]); + } else { + MARS_ERR("cannot find brick '%s'\n", cmd.cmd_str1 ? 
cmd.cmd_str1 : "NULL"); + status = -EINVAL; + } + + cmd.cmd_int1 = status; + status = mars_send_struct(sock, &cmd, mars_cmd_meta); + break; + } + case CMD_MREF: + { + struct mref_object *mref; + struct server_mref_aspect *mref_a; + + mref = server_alloc_mref(&brick->hidden_output, &brick->mref_object_layout); + status = -ENOMEM; + if (!mref) + break; + mref_a = server_mref_get_aspect(&brick->hidden_output, mref); + if (unlikely(!mref_a)) { + kfree(mref); + break; + } + + status = mars_recv_mref(sock, mref); + if (status < 0) + break; + + mref_a->brick = brick; + mref_a->sock = sock; + mref->_ref_cb.cb_private = mref_a; + mref->_ref_cb.cb_fn = server_endio; + mref->ref_cb = &mref->_ref_cb; + atomic_inc(&brick->in_flight); + + status = GENERIC_INPUT_CALL(brick->inputs[0], mref_get, mref); + if (status < 0) { + MARS_INF("execution error = %d\n", status); + mref->_ref_cb.cb_error = status; + server_endio(&mref->_ref_cb); + mars_free_mref(mref); + status = 0; // continue serving requests + break; + } + + GENERIC_INPUT_CALL(brick->inputs[0], mref_io, mref); + GENERIC_INPUT_CALL(brick->inputs[0], mref_put, mref); + break; + } + case CMD_CB: + MARS_ERR("oops, as a server I should never get CMD_CB; something is wrong here - attack attempt??\n"); + break; + default: + MARS_ERR("unknown command %d\n", cmd.cmd_code); + } + if (status < 0) + break; + } + + //kernel_sock_shutdown(*sock, SHUT_WR); + sock_release(*sock); + //cleanup_mm(); + +done: + MARS_DBG("handler_thread terminating, status = %d\n", status); + mars_power_button((void*)brick, false); + do { + int status; + if (!brick->ops || !brick->ops->brick_switch) { + MARS_FAT("cannot switch off - this will do no real harm, but leave a memory leak\n"); + break; + } + status = brick->ops->brick_switch(brick); + if (status < 0) { + MARS_ERR("server shutdown failed, status = %d\n", status); + } else if (max_round-- < 0) + break; + msleep(1000); + } while (!brick->power.led_off); + + if (brick->inputs[0] && 
brick->inputs[0]->connect) {
+		MARS_DBG("disconnecting input %p\n", brick->inputs[0]->connect);
+		(void)generic_disconnect((void*)brick->inputs[0]);
+	}
+
+	// Wait until no request is in flight any more and the brick reports
+	// led_off. The msleep() argument starts at 60 * 1000 and is raised by
+	// 30 * 1000 per round as long as it is still below 3600 * 1000.
+	timeout = 60 * 1000;
+	while (atomic_read(&brick->in_flight) || !brick->power.led_off) {
+		MARS_ERR("server brick has resources allocated - cannot terminate thread\n");
+		msleep(timeout);
+		if (timeout < 3600 * 1000)
+			timeout += 30 * 1000;
+	}
+
+	// NOTE(review): the brick memory itself is not kfree()d here; confirm
+	// whether generic_brick_exit_full() or another party releases it.
+	(void)generic_brick_exit_full((void*)brick);
+	MARS_DBG("done\n");
+	return 0;
+}
+
+////////////////// own brick / input / output operations //////////////////
+
+// The four output operations below do no work of their own: each one
+// looks up input 0 of the owning brick and hands the call through
+// GENERIC_INPUT_CALL with the same arguments.
+
+// Pass mars_get_info through input 0; returns whatever that call returns.
+static int server_get_info(struct server_output *output, struct mars_info *info)
+{
+	struct server_input *input = output->brick->inputs[0];
+	return GENERIC_INPUT_CALL(input, mars_get_info, info);
+}
+
+// Pass mref_get through input 0; returns whatever that call returns.
+static int server_ref_get(struct server_output *output, struct mref_object *mref)
+{
+	struct server_input *input = output->brick->inputs[0];
+	return GENERIC_INPUT_CALL(input, mref_get, mref);
+}
+
+// Pass mref_put through input 0.
+static void server_ref_put(struct server_output *output, struct mref_object *mref)
+{
+	struct server_input *input = output->brick->inputs[0];
+	GENERIC_INPUT_CALL(input, mref_put, mref);
+}
+
+// Pass mref_io through input 0.
+static void server_ref_io(struct server_output *output, struct mref_object *mref)
+{
+	struct server_input *input = output->brick->inputs[0];
+	GENERIC_INPUT_CALL(input, mref_io, mref);
+}
+
+// brick_switch callback (installed in server_brick_ops below).
+// It only mirrors power.button onto the two LEDs: button set -> led_off
+// cleared, then led_on set; button clear -> led_on cleared, then led_off
+// set. Always returns 0.
+static int server_switch(struct server_brick *brick)
+{
+	if (brick->power.button) {
+		mars_power_led_off((void*)brick, false);
+
+		MARS_INF("starting.....");
+
+		mars_power_led_on((void*)brick, true);
+	} else {
+		mars_power_led_on((void*)brick, false);
+		mars_power_led_off((void*)brick, true);
+	}
+	return 0;
+}
+
+//////////////// object / aspect constructors / destructors ///////////////
+
+// Aspect constructor: nothing to initialise, always succeeds.
+// The cast plus "(void)ini" presumably only keeps the typed pointer
+// around without triggering an unused-variable warning.
+static int server_mref_aspect_init_fn(struct generic_aspect *_ini, void *_init_data)
+{
+	struct server_mref_aspect *ini = (void*)_ini;
+	(void)ini;
+	return 0;
+}
+
+// Aspect destructor: nothing to release.
+static void server_mref_aspect_exit_fn(struct generic_aspect *_ini, void *_init_data)
+{
+	struct server_mref_aspect *ini = (void*)_ini;
+	(void)ini;
+}
+
+// NOTE(review): macro defined outside this file; presumably it generates the
+// server_* helpers referenced here but not defined in this file (e.g.
+// server_aspect_types, server_make_object_layout, _server_output_init)
+// -- TODO confirm.
+MARS_MAKE_STATICS(server);
+
+////////////////////// brick constructors / destructors ////////////////////
+
+// Brick constructor: initialises the embedded hidden_output under the name
+// "internal", the startup_event wait queue and socket_sem (count 1).
+// Always returns 0.
+static int server_brick_construct(struct server_brick *brick)
+{
+	struct server_output *hidden = &brick->hidden_output;
+	_server_output_init(brick, hidden, "internal");
+	init_waitqueue_head(&brick->startup_event);
+	sema_init(&brick->socket_sem, 1);
+	return 0;
+}
+
+// Output constructor: nothing to do, always returns 0.
+static int server_output_construct(struct server_output *output)
+{
+	return 0;
+}
+
+///////////////////////// static structs ////////////////////////
+
+// Brick-level operations: only the power switch is provided.
+static struct server_brick_ops server_brick_ops = {
+	.brick_switch = server_switch,
+};
+
+// Output-level operations: the pass-through functions defined above.
+static struct server_output_ops server_output_ops = {
+	.make_object_layout = server_make_object_layout,
+	.mars_get_info = server_get_info,
+	.mref_get = server_ref_get,
+	.mref_put = server_ref_put,
+	.mref_io = server_ref_io,
+};
+
+const struct server_input_type server_input_type = {
+	.type_name = "server_input",
+	.input_size = sizeof(struct server_input),
+};
+
+static const struct server_input_type *server_input_types[] = {
+	&server_input_type,
+};
+
+const struct server_output_type server_output_type = {
+	.type_name = "server_output",
+	.output_size = sizeof(struct server_output),
+	.master_ops = &server_output_ops,
+	.output_construct = &server_output_construct,
+	.aspect_types = server_aspect_types,
+	.layout_code = {
+		[BRICK_OBJ_MREF] = LAYOUT_ALL,
+	}
+};
+
+static const struct server_output_type *server_output_types[] = {
+	&server_output_type,
+};
+
+// Brick type descriptor: one input, no regular outputs (max_outputs = 0).
+// The only output instance visible in this file is the hidden_output
+// embedded in struct server_brick.
+const struct server_brick_type server_brick_type = {
+	.type_name = "server_brick",
+	.brick_size = sizeof(struct server_brick),
+	.max_inputs = 1,
+	.max_outputs = 0,
+	.master_ops = &server_brick_ops,
+	.default_input_types = server_input_types,
+	.default_output_types = server_output_types,
+	.brick_construct = &server_brick_construct,
+};
+EXPORT_SYMBOL_GPL(server_brick_type);
+
+///////////////////////////////////////////////////////////////////////
+
+// strategy layer
+
+/* Accept loop of the MARS server (runs as kernel thread "mars_server").
+ *
+ * Polls server_socket with a non-blocking kernel_accept(). For every new
+ * connection it allocates and initialises one server brick, starts a
+ * "mars_handler%d" thread for it and waits until that thread has cleared
+ * brick->handler_thread before accepting the next connection.
+ *
+ * On any failure after accept() the connection is shut down and released
+ * again, and the brick allocated for it is freed.
+ *
+ * @data: unused.
+ * Returns 0 (the outer status is never changed inside the loop).
+ */
+static int _server_thread(void *data)
+{
+	char *id = my_id();
+	int version = 0;
+	int status = 0;
+
+	//fake_mm();
+
+	MARS_INF("-------- server starting on host '%s' ----------\n", id);
+
+	while (!kthread_should_stop()) {
+		int size;
+		int err; // per-connection status; deliberately not the outer "status"
+		bool initialized = false;
+		struct server_brick *brick = NULL;
+		struct task_struct *thread;
+		struct socket *new_socket = NULL;
+
+		err = kernel_accept(server_socket, &new_socket, O_NONBLOCK);
+		if (err < 0) {
+			msleep(500);
+			if (err == -EAGAIN)
+				continue; // without error message
+			MARS_ERR("accept status = %d\n", err);
+			continue;
+		}
+		if (!new_socket) {
+			MARS_ERR("got no socket\n");
+			msleep(3000);
+			continue;
+		}
+		MARS_DBG("got new connection %p\n", new_socket);
+
+		/* TODO: check authorization.
+		 */
+
+		// brick itself + pointer arrays for inputs/outputs + the one input
+		size = server_brick_type.brick_size +
+			(server_brick_type.max_inputs + server_brick_type.max_outputs) * sizeof(void*) +
+			sizeof(struct server_input);
+
+		brick = kzalloc(size, GFP_MARS);
+		if (!brick) {
+			MARS_ERR("cannot allocate server instance\n");
+			goto err;
+		}
+
+		err = generic_brick_init_full(brick, size, (void*)&server_brick_type, NULL, NULL, NULL);
+		if (err) {
+			MARS_ERR("cannot init server brick, status = %d\n", err);
+			goto err;
+		}
+		initialized = true;
+
+		thread = kthread_create(handler_thread, brick, "mars_handler%d", version++);
+		if (IS_ERR(thread)) {
+			MARS_ERR("cannot create thread, status = %ld\n", PTR_ERR(thread));
+			goto err;
+		}
+		brick->handler_thread = thread;
+		brick->handler_socket = new_socket;
+		wake_up_process(thread);
+		// From here on the handler thread owns both brick and socket.
+		wait_event_interruptible(brick->startup_event, brick->handler_thread == NULL);
+		continue;
+
+	err:
+		if (brick) {
+			// NOTE(review): assumes generic_brick_exit_full() only
+			// tears the brick down and does not free the memory that
+			// was handed to generic_brick_init_full() -- confirm in
+			// brick.c before merging.
+			if (initialized)
+				(void)generic_brick_exit_full((void*)brick);
+			kfree(brick);
+		}
+		if (new_socket) {
+			kernel_sock_shutdown(new_socket, SHUT_WR);
+			sock_release(new_socket);
+		}
+	}
+
+	MARS_INF("-------- cleaning up ----------\n");
+
+	//cleanup_mm();
+
+	MARS_INF("-------- done status = %d ----------\n", status);
+	server_thread = NULL;
+	return status;
+}
+
+//////////////////
module init stuff /////////////////////////
+
+/* Module entry point.
+ *
+ * Creates the listening socket, registers the server brick type and starts
+ * the accept thread ("mars_server"). The brick type is registered before
+ * the thread is started so that a registration failure can no longer leave
+ * a running thread behind while module loading fails.
+ *
+ * Every failure after the socket has been created releases the socket again
+ * (and unregisters the brick type where it was already registered).
+ *
+ * Returns the status of server_register_brick_type() on success, or a
+ * negative error code.
+ */
+static int __init init_server(void)
+{
+	struct sockaddr_storage sockaddr = {};
+	struct task_struct *thread;
+	int status;
+
+	MARS_INF("init_server()\n");
+
+	status = mars_create_sockaddr(&sockaddr, "");
+	if (status < 0)
+		return status;
+
+	status = mars_create_socket(&server_socket, &sockaddr, true);
+	if (status < 0)
+		return status;
+
+	status = kernel_listen(server_socket, 100);
+	if (status < 0)
+		goto err_socket;
+
+	status = server_register_brick_type();
+	if (status < 0)
+		goto err_socket;
+
+	thread = kthread_create(_server_thread, NULL, "mars_server");
+	if (IS_ERR(thread)) {
+		status = PTR_ERR(thread);
+		goto err_type;
+	}
+
+	server_thread = thread;
+	wake_up_process(thread);
+
+	return status;
+
+err_type:
+	server_unregister_brick_type();
+err_socket:
+	sock_release(server_socket);
+	server_socket = NULL;
+	return status;
+}
+
+/* Module exit: unregisters the brick type, shuts the listening socket down,
+ * stops the accept thread and releases the socket once the thread has
+ * cleared server_thread (which _server_thread does right before returning).
+ */
+static void __exit exit_server(void)
+{
+	MARS_INF("exit_server()\n");
+	server_unregister_brick_type();
+	if (server_thread) {
+		if (server_socket) {
+			kernel_sock_shutdown(server_socket, SHUT_WR);
+		}
+		kthread_stop(server_thread);
+		if (server_socket && !server_thread) {
+			sock_release(server_socket);
+			server_socket = NULL;
+		}
+	}
+}
+
+MODULE_DESCRIPTION("MARS server brick");
+MODULE_AUTHOR("Thomas Schoebel-Theuer ");
+MODULE_LICENSE("GPL");
+
+module_init(init_server);
+module_exit(exit_server);
diff --git a/mars_server.h b/mars_server.h
new file mode 100644
index 00000000..942beab9
--- /dev/null
+++ b/mars_server.h
@@ -0,0 +1,40 @@
+// (c) 2010 Thomas Schoebel-Theuer / 1&1 Internet AG
+#ifndef MARS_SERVER_H
+#define MARS_SERVER_H
+
+#include
+
+#include "mars_net.h"
+
+//extern struct socket *server_socket;
+//extern struct task_struct *server_thread;
+//extern wait_queue_head_t server_event;
+
+struct server_mref_aspect {
+	GENERIC_ASPECT(mref);
+	struct server_brick *brick;
+	struct socket **sock;
+};
+
+struct server_output {
+	MARS_OUTPUT(server);
+};
+
+struct server_brick {
+	MARS_BRICK(server);
+	atomic_t in_flight;
+	struct socket *handler_socket;
+	struct semaphore socket_sem;
+	struct task_struct *handler_thread;
+	wait_queue_head_t startup_event;
+	struct
generic_object_layout mref_object_layout; + struct server_output hidden_output; +}; + +struct server_input { + MARS_INPUT(server); +}; + +MARS_TYPES(server); + +#endif diff --git a/mars_test.c b/mars_test.c index 4051a96e..8498f91b 100644 --- a/mars_test.c +++ b/mars_test.c @@ -77,7 +77,7 @@ static struct device_sio_brick *_device_brick = NULL; void make_test_instance(void) { - static char *names[] = { "brick" }; + static const char *names[] = { "brick" }; struct generic_output *first = NULL; struct generic_output *inter = NULL; struct generic_input *last = NULL; @@ -154,7 +154,8 @@ void make_test_instance(void) #ifdef CONF_FDSYNC _device_brick->outputs[0]->o_fdsync = true; #endif - device_brick->ops->brick_switch(device_brick, true); + mars_power_button((void*)device_brick, true); + device_brick->ops->brick_switch(device_brick); first = device_brick->outputs[0]; // last @@ -269,7 +270,8 @@ void make_test_instance(void) MARS_INF("------------- START GATE --------------\n"); - _if_brick->ops->brick_switch(_if_brick, true); + mars_power_button((void*)if_brick, true); + _if_brick->ops->brick_switch(_if_brick); //_if_brick->is_active = true; msleep(2000); diff --git a/mars_trans_logger.c b/mars_trans_logger.c index 9210d013..b0102ea4 100644 --- a/mars_trans_logger.c +++ b/mars_trans_logger.c @@ -1,6 +1,6 @@ // (c) 2010 Thomas Schoebel-Theuer / 1&1 Internet AG -// Trans_Logger brick (just for demonstration) +// Trans_Logger brick //#define BRICK_DEBUGGING //#define MARS_DEBUGGING @@ -17,10 +17,9 @@ #include "mars_trans_logger.h" -//#define inline /**/ +#if 0 #define inline __attribute__((__noinline__)) -#define _noinline /**/ -//#define _noinline __attribute__((__noinline__)) +#endif //////////////////////////////////////////////////////////////////// @@ -35,7 +34,7 @@ static inline bool q_cmp(struct pairing_heap_mref *_a, struct pairing_heap_mref _PAIRING_HEAP_FUNCTIONS(static,mref,q_cmp); -static inline void q_init(struct logger_queue *q) _noinline +static inline void 
q_init(struct logger_queue *q) { INIT_LIST_HEAD(&q->q_anchor); q->heap_low = NULL; @@ -69,7 +68,7 @@ always_done: return res; } -static inline void q_insert(struct logger_queue *q, struct trans_logger_mref_aspect *mref_a) _noinline +static inline void q_insert(struct logger_queue *q, struct trans_logger_mref_aspect *mref_a) { unsigned long flags; @@ -89,7 +88,7 @@ static inline void q_insert(struct logger_queue *q, struct trans_logger_mref_asp traced_unlock(&q->q_lock, flags); } -static inline void q_pushback(struct logger_queue *q, struct trans_logger_mref_aspect *mref_a) _noinline +static inline void q_pushback(struct logger_queue *q, struct trans_logger_mref_aspect *mref_a) { unsigned long flags; @@ -107,7 +106,7 @@ static inline void q_pushback(struct logger_queue *q, struct trans_logger_mref_a traced_unlock(&q->q_lock, flags); } -static inline struct trans_logger_mref_aspect *q_fetch(struct logger_queue *q) _noinline +static inline struct trans_logger_mref_aspect *q_fetch(struct logger_queue *q) { struct trans_logger_mref_aspect *mref_a = NULL; unsigned long flags; @@ -158,7 +157,7 @@ static inline struct trans_logger_mref_aspect *q_fetch(struct logger_queue *q) _ ///////////////////////// own helper functions //////////////////////// -static inline int hash_fn(loff_t base_index) _noinline +static inline int hash_fn(loff_t base_index) { // simple and stupid loff_t tmp; @@ -222,7 +221,7 @@ static struct trans_logger_mref_aspect *hash_find(struct hash_anchor *table, lof return res; } -static inline void hash_insert(struct hash_anchor *table, struct trans_logger_mref_aspect *elem_a, atomic_t *cnt) _noinline +static inline void hash_insert(struct hash_anchor *table, struct trans_logger_mref_aspect *elem_a, atomic_t *cnt) { loff_t base_index = elem_a->object->ref_pos >> REGION_SIZE_BITS; int hash = hash_fn(base_index); @@ -242,7 +241,7 @@ static inline void hash_insert(struct hash_anchor *table, struct trans_logger_mr traced_writeunlock(&start->hash_lock, flags); } 
-static inline bool hash_put(struct hash_anchor *table, struct trans_logger_mref_aspect *elem_a, atomic_t *cnt) _noinline +static inline bool hash_put(struct hash_anchor *table, struct trans_logger_mref_aspect *elem_a, atomic_t *cnt) { struct mref_object *elem = elem_a->object; loff_t base_index = elem->ref_pos >> REGION_SIZE_BITS; @@ -327,7 +326,7 @@ static int _write_ref_get(struct trans_logger_output *output, struct trans_logge } mref_a->output = output; - mref_a->stamp = CURRENT_TIME; + get_lamport(&mref_a->stamp); mref->ref_flags = MREF_UPTODATE; mref_a->shadow_ref = mref_a; // cyclic self-reference atomic_set(&mref->ref_count, 1); @@ -354,6 +353,13 @@ static int trans_logger_ref_get(struct trans_logger_output *output, struct mref_ if (mref->ref_may_write == READ) { return _read_ref_get(output, mref_a); } + + /* FIXME: THIS IS PROVISIONARY (use event instead) + */ + while (unlikely(!output->brick->power.led_on)) { + msleep(2 * HZ); + } + return _write_ref_get(output, mref_a); err: @@ -463,7 +469,7 @@ static void trans_logger_ref_io(struct trans_logger_output *output, struct mref_ MARS_DBG("hashing %d at %lld\n", mref->ref_len, mref->ref_pos); hash_insert(output->hash_table, mref_a, &output->hash_count); q_insert(&output->q_phase1, mref_a); - wake_up(&output->event); + wake_up_interruptible(&output->event); } return; } @@ -526,7 +532,7 @@ static void phase1_endio(struct generic_callback *cb) // queue up for the next phase q_insert(&output->q_phase2, orig_mref_a); - wake_up(&output->event); + wake_up_interruptible(&output->event); err: ; } @@ -567,6 +573,13 @@ static bool phase1_startio(struct trans_logger_mref_aspect *orig_mref_a) goto err; } atomic_inc(&output->q_phase1.q_flying); + + /* NYI Provisionary! this is wrong! + * All requests must be sorted according to pos, + * only the smallest _uncommitted_ write-back + * should be counting! 
+ */ + brick->current_pos = brick->logst.log_pos; return true; err: @@ -605,7 +618,7 @@ static void phase2_endio(struct generic_callback *cb) } else { q_insert(&output->q_phase4, sub_mref_a); } - wake_up(&output->event); + wake_up_interruptible(&output->event); err: ; } @@ -715,7 +728,7 @@ static void phase3_endio(struct generic_callback *cb) // queue up for the next phase q_insert(&output->q_phase4, sub_mref_a); - wake_up(&output->event); + wake_up_interruptible(&output->event); err: ; } @@ -795,7 +808,7 @@ put: //MARS_INF("put ORIGREF.\n"); CHECK_ATOMIC(&orig_mref->ref_count, 1); trans_logger_ref_put(orig_mref_a->output, orig_mref); - wake_up(&output->event); + wake_up_interruptible(&output->event); err: ; } @@ -845,8 +858,6 @@ err: /********************************************************************* * The logger thread. * There is only a single instance, dealing with all requests in parallel. - * So there is less need for locking (concept stolen from microkernel - * architectures). 
*/ static int run_queue(struct logger_queue *q, bool (*startio)(struct trans_logger_mref_aspect *sub_mref_a), int max) @@ -868,17 +879,26 @@ static int run_queue(struct logger_queue *q, bool (*startio)(struct trans_logger return 0; } -static int trans_logger_thread(void *data) +static inline int _congested(struct trans_logger_output *output) { - struct trans_logger_output *output = data; - struct trans_logger_brick *brick; + return atomic_read(&output->q_phase1.q_queued) + || atomic_read(&output->q_phase1.q_flying) + || atomic_read(&output->q_phase2.q_queued) + || atomic_read(&output->q_phase2.q_flying) + || atomic_read(&output->q_phase3.q_queued) + || atomic_read(&output->q_phase3.q_flying) + || atomic_read(&output->q_phase4.q_queued) + || atomic_read(&output->q_phase4.q_flying); +} + +static +void trans_logger_log(struct trans_logger_output *output) +{ + struct trans_logger_brick *brick = output->brick; int wait_jiffies = HZ; int last_jiffies = jiffies; - brick = output->brick; - MARS_INF("logger has started.\n"); - - while (!kthread_should_stop()) { + while (!kthread_should_stop() || _congested(output)) { int status; wait_event_interruptible_timeout( @@ -886,7 +906,8 @@ static int trans_logger_thread(void *data) q_is_ready(&output->q_phase1) || q_is_ready(&output->q_phase2) || q_is_ready(&output->q_phase3) || - q_is_ready(&output->q_phase4), + q_is_ready(&output->q_phase4) || + (kthread_should_stop() && !_congested(output)), wait_jiffies); #if 1 if (((int)jiffies) - last_jiffies >= HZ * 10 && atomic_read(&output->hash_count) > 0) { @@ -923,6 +944,73 @@ static int trans_logger_thread(void *data) (void)run_queue(&output->q_phase4, phase4_startio, 64); } } +} + +static +void trans_logger_replay(struct trans_logger_output *output) +{ + struct trans_logger_brick *brick = output->brick; + + MARS_INF("NYI simulating replay at %lld....\n", brick->current_pos); + msleep(15 * 1000); + MARS_INF("NYI simulated replay finished at %lld....\n", brick->end_pos); + 
brick->current_pos = brick->end_pos; + mars_trigger(); + + while (!kthread_should_stop()) { + msleep(1000); + } +} + +static +int trans_logger_thread(void *data) +{ + struct trans_logger_output *output = data; + struct trans_logger_brick *brick = output->brick; + + MARS_INF("........... logger has started.\n"); + + brick->current_pos = brick->start_pos; + mars_power_led_on((void*)brick, true); + + if (brick->do_replay) { + trans_logger_replay(output); + } else { + trans_logger_log(output); + } + + MARS_INF("........... logger has stopped.\n"); + mars_power_led_off((void*)brick, true); + return 0; +} + +static +int trans_logger_switch(struct trans_logger_brick *brick) +{ + static int index = 0; + struct trans_logger_output *output = brick->outputs[0]; + + if (brick->power.button) { + mars_power_led_off((void*)brick, false); + if (!output->thread) { + output->thread = kthread_create(trans_logger_thread, output, "mars_logger%d", index++); + if (IS_ERR(output->thread)) { + int error = PTR_ERR(output->thread); + MARS_ERR("cannot create thread, status=%d\n", error); + output->thread = NULL; + return error; + } + get_task_struct(output->thread); + wake_up_process(output->thread); + } + } else { + mars_power_led_on((void*)brick, false); + if (output->thread) { + kthread_stop(output->thread); + put_task_struct(output->thread); + output->thread = NULL; + } + } return 0; } @@ -956,7 +1044,6 @@ static int trans_logger_brick_construct(struct trans_logger_brick *brick) static int trans_logger_output_construct(struct trans_logger_output *output) { - static int index = 0; int i; for (i = 0; i < TRANS_HASH_MAX; i++) { struct hash_anchor *start = &output->hash_table[i]; @@ -969,13 +1056,6 @@ static int trans_logger_output_construct(struct trans_logger_output *output) q_init(&output->q_phase2); q_init(&output->q_phase3); q_init(&output->q_phase4); - output->thread = kthread_create(trans_logger_thread, output, "mars_logger%d", index++); - if (IS_ERR(output->thread)) { - int error = 
PTR_ERR(output->thread); - MARS_ERR("cannot create thread, status=%d\n", error); - return error; - } - wake_up_process(output->thread); return 0; } @@ -987,6 +1067,7 @@ static int trans_logger_input_construct(struct trans_logger_input *input) ///////////////////////// static structs //////////////////////// static struct trans_logger_brick_ops trans_logger_brick_ops = { + .brick_switch = trans_logger_switch, }; static struct trans_logger_output_ops trans_logger_output_ops = { @@ -1040,13 +1121,13 @@ EXPORT_SYMBOL_GPL(trans_logger_brick_type); static int __init init_trans_logger(void) { - printk(MARS_INFO "init_trans_logger()\n"); + MARS_INF("init_trans_logger()\n"); return trans_logger_register_brick_type(); } static void __exit exit_trans_logger(void) { - printk(MARS_INFO "exit_trans_logger()\n"); + MARS_INF("exit_trans_logger()\n"); trans_logger_unregister_brick_type(); } diff --git a/mars_trans_logger.h b/mars_trans_logger.h index 5c07395e..394d3f2b 100644 --- a/mars_trans_logger.h +++ b/mars_trans_logger.h @@ -54,8 +54,14 @@ struct trans_logger_brick { MARS_BRICK(trans_logger); struct log_status logst; // parameters - bool log_reads; - int limit_congest; // limit phase1 congestion. + int sequence; // logfile sequence number + int limit_congest;// limit phase1 congestion. 
+	bool do_replay;   // mode of operation
+	bool log_reads;   // additionally log pre-images
+	loff_t start_pos; // where to start replay
+	loff_t end_pos;   // end of replay
+	// readonly from outside
+	loff_t current_pos; // current replay position
 };
 
 struct trans_logger_output {
diff --git a/userspace/marsadm b/userspace/marsadm
new file mode 100644
index 00000000..625337d1
--- /dev/null
+++ b/userspace/marsadm
@@ -0,0 +1,279 @@
+#!/usr/bin/perl -w
+# (c) 2010 Thomas Schoebel-Theuer / 1&1 Internet AG
+
+# marsadm -- administration frontend for MARS.
+#
+# All state is kept as directories and symlinks below $mars:
+#   $mars/ips/ip-<host>            IP address of each known node
+#   $mars/resource-<name>/...      one directory per resource
+# Usage: marsadm <command> <resource-or-peer> [further arguments]
+
+use strict;
+use English;
+use warnings;
+use Fcntl 'SEEK_SET';
+
+my $mars = "/mars";
+my $host = `uname -n` or die "cannot determine my network node name\n";
+chomp $host;
+my $ip = `ip a` or die "cannot determine my IP address\n";
+$ip =~ s/\A.*inet +(?!127\.0\.)([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+).*\Z/$1/ms or die "cannot parse my IP address\n";
+
+##################################################################
+
+# path helpers
+
+# Directory holding all links of resource $res.
+sub res_dir {
+    my $res = shift;
+    return "$mars/resource-$res";
+}
+
+# True if $path exists as a symlink (even a dangling one) or as anything else.
+sub path_exists {
+    my $path = shift;
+    return (-l $path || -e $path) ? 1 : 0;
+}
+
+# Create directory $dir unless it is already there.
+sub make_dir {
+    my $dir = shift;
+    return if -d $dir;
+    mkdir($dir) or die "cannot create directory '$dir': $!\n";
+}
+
+# Make sure $mars/ips exists and contains the link announcing my own IP.
+# An already existing link is left untouched.
+sub register_own_ip {
+    make_dir($mars);
+    make_dir("$mars/ips");
+    my $link = "$mars/ips/ip-$host";
+    return if -l $link;
+    symlink($ip, $link) or die "cannot register my ip '$ip' as '$link': $!\n";
+}
+
+##################################################################
+
+# syntactic checks
+
+# Resource / device identifiers: C-like identifier, at most 16 characters.
+sub check_id {
+    my $str = shift;
+    die "identifier '$str' has disallowed characters" unless $str =~ m/^[A-Za-z_][A-Za-z0-9_]*$/;
+    die "identifier '$str' is too long (only 16 chars allowed)" if length($str) > 16;
+}
+
+# Peer host names (join-system): may contain '.', '-' and '_', but must
+# start with an alphanumeric character so that the name can never be
+# mistaken for a command line option by rsync.
+sub check_peer {
+    my $str = shift;
+    die "peer name '$str' has disallowed characters\n" unless $str =~ m/\A[A-Za-z0-9][A-Za-z0-9._-]*\z/;
+}
+
+##################################################################
+
+# semantic checks
+
+sub check_res {
+    my $res = shift;
+    die "resource '$res' does not exist\n" unless -d res_dir($res);
+}
+
+# Parse a size such as "100", "1.5g" or "20M" (binary units k, m, g, t, p).
+# Returns the size in bytes as an integer, or -1 if $arg is not a size
+# (the caller then treats $arg as a device path).
+sub check_size {
+    my $arg = shift;
+    return -1 unless $arg =~ m/\A([0-9]+\.?[0-9]*|\.[0-9]+)([kmgtp]?)\z/i;
+    my ($number, $unit) = ($1, lc $2);
+    my %power_of = ('' => 0, k => 1, m => 2, g => 3, t => 4, p => 5);
+    return int($number * 1024 ** $power_of{$unit});
+}
+
+##################################################################
+
+# commands
+
+sub ignore_cmd {
+    my ($cmd, $res) = @_;
+    print "ignoring command '$cmd' on resource '$res'\n";
+    exit(0);
+}
+
+sub senseless_cmd {
+    my ($cmd, $res) = @_;
+    print "command '$cmd' makes no sense with MARS (ignoring)\n";
+    exit(0);
+}
+
+sub forbidden_cmd {
+    my ($cmd, $res) = @_;
+    die "command '$cmd' cannot be used with MARS (it is impossible to carry out uniquely and could therefore lead to a disaster)\n";
+}
+
+sub nyi_cmd {
+    my ($cmd, $res) = @_;
+    die "command '$cmd' is not yet implemented\n";
+}
+
+# join-system <peer> [--force]
+# Copies the list of known nodes from $peer and registers my own IP.
+# Refuses to run when local resources exist, unless --force is given.
+sub join_system {
+    my ($cmd, $peer, $force) = @_;
+    # list context: glob() in scalar context is an iterator, not a test
+    my @existing = glob("$mars/resource-*");
+    if(@existing) {
+        die "Sorry, some resources already exist!\nThis is dangerous!\nIf you are sure that no resource clash is possible, re-invoke this command with '--force' option\n" unless ($force and $force =~ m/--force/);
+    }
+    print "joining system via rsync (peer='$peer')\n";
+    make_dir($mars);
+    make_dir("$mars/ips");
+    # list form: no shell is involved, so nothing in $peer gets interpreted
+    system("rsync", "--recursive", "--links", "-v", "$peer:$mars/ips/", "$mars/ips/") == 0 or die "oops\n";
+    register_own_ip();
+}
+
+# create-resource <res> <device-or-size> [<appear>]
+# join-resource   <res> <device-or-size> [<appear>]
+#
+# create-resource builds the resource in a temporary directory and renames
+# it into place at the very end; join-resource adds this host to a resource
+# directory that already exists.
+sub create_res {
+    my ($cmd, $res, $dev, $appear) = @_;
+    my $create = ($cmd eq "create-resource");
+    my $final = res_dir($res);
+
+    die "undefined device or size argument\n" unless $dev;
+    if($create) {
+        die "resource '$res' already exists\n" if -d $final;
+    } else {
+        die "resource '$res' does not exist\n" unless -d $final;
+    }
+    check_id($appear) if $appear;
+
+    if($create) {
+        print "creating new resource '$res'\n";
+    } else {
+        print "joining to existing resource '$res'\n";
+    }
+
+    my $size = check_size($dev);
+    if($size > 0) {
+        $dev = "";
+    } else {
+        die "block device '$dev' must be an absolute path starting with '/'\n" unless $dev =~ m/^\//;
+        die "block device '$dev' does not exist\n" unless -b $dev;
+    }
+
+    my $tmp;
+    my $primary;
+    if($create) {
+        $tmp = "$mars/.tmp.$res";
+        register_own_ip();
+        system("rm", "-rf", $tmp);
+        mkdir($tmp) or die "could not create resource '$res'\n";
+    } else {
+        # work directly in the existing resource directory
+        $tmp = $final;
+        $primary = readlink("$tmp/primary") or die "cannot determine primary\n";
+        die "resource '$res' is already joined\n" if path_exists("$tmp/data-$host");
+        die "my ip '$ip' is not registered -- please run 'join-system' first\n" unless -l "$mars/ips/ip-$host";
+    }
+
+    my $file = "$tmp/data-$host";
+    if($size > 0) {
+        print "creating sparse file '$file' with size $size\n";
+        open(my $out, '>', $file) or die "could not open '$file'\n";
+        my $pos = sysseek($out, $size - 1, SEEK_SET);
+        die "could not seek\n" unless defined $pos and $pos == $size - 1;
+        # one NUL byte at the last position gives the file its full length
+        my $written = syswrite($out, "\0", 1);
+        die "cannot init sparse file\n" unless defined $written and $written == 1;
+        close($out) or die "cannot close '$file': $!\n";
+    } else {
+        print "using existing device '$dev'\n";
+        symlink($dev, $file) or die "cannot create device symlink\n";
+    }
+    if($appear) {
+        # TODO: check for uniqueness of $appear
+        print "resource '$res' will appear as local device '/dev/mars/$appear'\n";
+        unlink("$tmp/device-$host");
+        symlink($appear, "$tmp/device-$host") or die "cannot create symlink for local device appearance\n";
+    }
+
+    if($create) {
+        symlink($host, "$tmp/primary") or die "cannot create primary symlink\n";
+        symlink("log-000000001-$host,0", "$tmp/replay-$host") or die "cannot create replay status\n";
+        rename($tmp, $final) or die "cannot finalize resource '$res'\n";
+        print "successfully created resource '$res'\n";
+    } else {
+        unlink("$tmp/syncstatus-$host");
+        symlink("0", "$tmp/syncstatus-$host") or die "cannot start initial sync\n";
+        unlink("$tmp/connect-$host");
+        symlink($primary, "$tmp/connect-$host") or die "cannot create peer symlink\n";
+        symlink($host, "$tmp/connect-$primary") unless ( -l "$tmp/connect-$primary" or -l "$tmp/off.connect-$primary" );
+        print "successfully joined resource '$res'\n";
+    }
+}
+
+# attach / detach: rename data-<host> <-> off.data-<host>.
+# The data entry is a symlink (device) or a regular sparse file (size).
+sub attach_res {
+    my ($cmd, $res) = @_;
+    my $detach = ($cmd eq "detach");
+    my $dir = res_dir($res);
+    my $on = "$dir/data-$host";
+    my $off = "$dir/off.data-$host";
+    if($detach) {
+        die "resource '$res' is not attached\n" unless path_exists($on);
+        rename($on, $off) or die "operation failed\n";
+    } else {
+        die "resource '$res' is not detached\n" unless path_exists($off);
+        rename($off, $on) or die "operation failed\n";
+    }
+}
+
+# connect / disconnect: rename off.connect-<host> <-> connect-<host>.
+# Best effort as before: a host without any connect link (or one that is
+# already in the requested state) is silently left alone. Only a rename
+# that really fails is reported.
+sub connect_res {
+    my ($cmd, $res) = @_;
+    my $disconnect = ($cmd eq "disconnect");
+    my $dir = res_dir($res);
+    my $on = "$dir/connect-$host";
+    my $off = "$dir/off.connect-$host";
+    my ($from, $to) = $disconnect ? ($on, $off) : ($off, $on);
+    return unless -l $from;
+    rename($from, $to) or die "cannot $cmd resource '$res': $!\n";
+}
+
+sub up_res {
+    my ($cmd, $res) = @_;
+    my $down = ($cmd eq "down");
+    if($down) {
+        connect_res("disconnect", $res);
+        attach_res("detach", $res);
+    } else {
+        attach_res("attach", $res);
+        connect_res("connect", $res);
+    }
+}
+
+# primary / secondary: atomically replace the "primary" link of the resource.
+# "secondary" installs the pseudo host "(none)".
+sub primary_res {
+    my ($cmd, $res) = @_;
+    my $sec = ($cmd eq "secondary");
+    my $dir = res_dir($res);
+    my $pri = "$dir/primary";
+    my $old = readlink($pri) or die "cannot determine current primary\n";
+    my $new = $host;
+    if($sec) {
+        die "for safety reasons, switching to secondary is only allowed when I am primary\n" if($old ne $host);
+        $new = "(none)";
+    } elsif($old eq $host) {
+        print "I am already primary.\n";
+        exit(0);
+    }
+
+    # TODO: check whether we can switch without interrupting service....
+
+    my $tmp = "$dir/.tmp.primary";
+    unlink($tmp);
+    symlink($new, $tmp) or die "cannot create new primary symlink\n";
+    rename($tmp, $pri) or die "cannot install new primary symlink\n";
+    print "primary changed from '$old' to '$new'\n";
+}
+
+sub role_cmd {
+    my ($cmd, $res) = @_;
+    my $pri = res_dir($res) . "/primary";
+    my $old = readlink($pri) or die "cannot determine current primary\n";
+    if($old eq $host) {
+        print "primary\n";
+    } else {
+        print "secondary\n";
+    }
+}
+
+##################################################################
+
+my %cmd_table =
+    (
+     # new keywords
+     "join-system" => \&join_system,
+     "create-resource" => \&create_res,
+     "join-resource" => \&create_res,
+
+     # compatible keywords
+     "attach" => \&attach_res,
+     "detach" => \&attach_res,
+     "connect" => \&connect_res,
+     "disconnect" => \&connect_res,
+     "syncer" => \&ignore_cmd,
+     "up" => \&up_res,
+     "down" => \&up_res,
+     "primary" => \&primary_res,
+     "secondary" => \&primary_res,
+     "invalidate" => \&nyi_cmd,
+     "invalidate-remote" => \&forbidden_cmd,
+     "resize" => \&nyi_cmd,
+     "create-md" => \&senseless_cmd,
+     "get-gi" => \&ignore_cmd,
+     "show-gi" => \&ignore_cmd,
+     "dump-md" => \&senseless_cmd,
+     "outdate" => \&ignore_cmd,
+     "adjust" => \&senseless_cmd,
+     "wait-connect" => \&nyi_cmd,
+     "role" => \&role_cmd,
+     "state" => \&role_cmd,
+     "cstate" => \&nyi_cmd,
+     "status" => \&nyi_cmd,
+     "dump" => \&senseless_cmd,
+     "verify" => \&nyi_cmd,
+     "pause-sync" => \&nyi_cmd,
+     "resume-sync" => \&nyi_cmd,
+     "new-current-uuid" => \&senseless_cmd,
+     "dstate" => \&nyi_cmd,
+     "hidden-commands" => \&ignore_cmd,
+    );
+
+my $cmd = shift || die "command argument is missing\n";
+my $res = shift || die "resource argument is missing\n";
+
+die "unknown command '$cmd'\n" if !exists $cmd_table{$cmd};
+if($cmd eq "join-system") {
+    # the second argument is a peer host name here, not a resource
+    check_peer($res);
+} else {
+    check_id($res);
+    check_res($res) unless $cmd eq "create-resource";
+}
+
+my $func = $cmd_table{$cmd};
+$func->($cmd, $res, @ARGV);