Improved tr

- Added support for character ranges ( a-z )
- Added support for complementary charset ( -c ), only in delete mode
- Added support for octal escape sequences
- Unicode now only works when there are no octal escape sequences,
  otherwise behavior is not predictable at first sight.
- tr now supports null characters in the input
- Does not yet have support for character classes ( [:upper:] )
This commit is contained in:
Adria Garriga 2014-07-15 00:49:42 +02:00 committed by sin
parent 8b3a9c1971
commit b3a63a60e4
2 changed files with 278 additions and 85 deletions

13
tr.1
View File

@ -3,7 +3,7 @@
tr \- translate characters
.SH SYNOPSIS
.B tr
.RB [ \-d ]
.RB [ \-d ] [ \-c ]
.RB set1
.P
.B tr
@ -13,6 +13,9 @@ tr \- translate characters
.TP
.B \-d
For compatibility. If given, characters in set1 will be deleted from the input and specifying set2 will result in an error.
.TP
.B \-c
Complement: inverts the specified character set, that is, the set becomes all the characters that were not specified.
It only works in conjunction with \-d, because order doesn't make much sense with translation.
.SH DESCRIPTION
.B tr
reads input from stdin replacing every character in
@ -50,9 +53,15 @@ If set1 is longer than set2
.B tr
will map all the remaining characters to the last one in set2. In case set2 is longer than set1, the remaining characters from set2 will be ignored.
.B
Escape sequences, whether single characters or octal numbers, are written by preceding the token with a "\\". An octal escape takes at most three digits:
reading stops at the first non-octal character or once three digits have been read.
.B
Use "A-B" for ordered sets from A to B.
.B
.SH NOTES
.B tr
is Unicode-aware but does not yet handle character classes (e.g. [:alnum:] or [:digit:]).
is Unicode-aware, but only if no characters are specified in octal (for example \\012); otherwise the result is not predictable. Character
classes (e.g. [:alnum:] or [:digit:]) are not supported.
.SH SEE ALSO
.IR sed(1)
.IR awk(1)

350
tr.c
View File

@ -3,7 +3,6 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <locale.h>
#include <wchar.h>
#include "text.h"
@ -12,135 +11,316 @@
/* Report the command synopsis, including the -c flag, through eprintf(). */
static void
usage(void)
{
	eprintf("usage: %s [-d] [-c] set1 [set2]\n", argv0);
}
static int dflag, cflag;
static wchar_t mappings[0x110000];
/* Iteration state used by get_next_wchar() while it walks a set
 * specification one wide character at a time. */
struct wset_state {
char *s; /* current character */
wchar_t rfirst, rlast; /* first and last in range */
wchar_t prev; /* previous returned character */
int prev_was_range; /* was the previous character part of a c-c range? */
};
/* Byte-wise counterpart of struct wset_state, used by get_next_char(). */
struct set_state {
/* s: cursor into the set string; rfirst/rlast: bounds of the range that is
 * currently being handed out; prev: last plain character returned */
char *s, rfirst, rlast, prev;
int prev_was_octal; /* was the previous returned character written in octal? */
};
/* Reset the range bookkeeping of a byte-wise set iterator.
 *
 * rfirst > rlast encodes "no range pending"; prev_was_octal starts out set.
 * The s and prev members are not touched here. */
static void
set_state_defaults(struct set_state *s)
{
	s->prev_was_octal = 1;
	s->rlast = 0;
	s->rfirst = 1;
}
static void
handleescapes(char *s)
wset_state_defaults(struct wset_state *s)
{
s->rfirst = 1;
s->rlast = 0;
s->prev_was_range = 1;
}
/* Resolve the escape sequence whose first character (the one right after
 * the backslash) is at *s, overwriting *s with the byte it denotes.
 *
 * Single-letter escapes (n t r f a b v and the backslash itself) are
 * replaced in place and 0 is returned. Otherwise up to three octal digits
 * starting at *s are folded into one byte, that byte is stored in *s, and
 * the number of digits consumed is returned so the caller can skip them.
 *
 * Problems are reported through eprintf(): a backslash at the very end of
 * the string, a character that is neither a known letter nor an octal
 * digit, and a three-digit octal value whose first digit is above 3. */
static int
resolve_escape(char *s)
{
	int i;
	unsigned char c;

	switch(*s) {
	case 'n':
		*s = '\n';
		return 0;
	case 't':
		*s = '\t';
		return 0;
	case 'r':
		*s = '\r';
		return 0;
	case 'f':
		*s = '\f';
		return 0;
	case 'a':
		*s = '\a';
		return 0;
	case 'b':
		*s = '\b';
		return 0;
	case 'v':
		*s = '\v';
		return 0;
	case '\\':
		*s = '\\';
		return 0;
	case '\0':
		eprintf("stray '\\' at end of input:");
		/* fallthrough */
	default: ;
	}

	if(*s<'0' || *s>'7')
		eprintf("invalid character after '\\':");

	/* accumulate at most three octal digits into a single byte */
	for(i=0, c=0; i<3 && s[i]>='0' && s[i]<='7'; i++) {
		c <<= 3;
		c += s[i]-'0';
	}

	/* three digits with a leading digit above 3 do not fit in 8 bits */
	if(*s>'3' && i==3)
		eprintf("octal byte cannot be bigger than 377:");

	*s = c;
	return i;
}
/* mbtowc() limited to the first four bytes of b */
#define embtowc(a, b) mbtowc((a), (b), 4)

/* Convert the multibyte character at s into *unicodep.
 *
 * Returns what mbtowc() returned. A negative result (undecodable input) is
 * reported through eprintf(). */
static int
xmbtowc(wchar_t *unicodep, const char *s)
{
	int rv;

	rv = embtowc(unicodep, s);
	if (rv < 0)
		eprintf("mbtowc: invalid input sequence:");
	return rv;
}
/* Tell whether the set specification s contains an octal escape, i.e. a
 * backslash that starts an escape sequence and is followed by a digit 0-7.
 *
 * The character following a backslash is always consumed together with it,
 * so an escaped backslash followed by a digit ("\\" then "0") is not taken
 * for an octal escape.
 *
 * Returns 1 when an octal escape is present, 0 otherwise. */
static int
has_octal_escapes(const char *s)
{
	for(; *s; s++) {
		if(*s != '\\')
			continue;
		s++; /* look at the escaped character */
		if(*s >= '0' && *s <= '7')
			return 1;
		if(*s == '\0')
			break; /* lone backslash at the end of the string */
	}
	return 0;
}
/* Return the next byte of the set described by s, expanding "a-b" ranges
 * and backslash escapes on the way.
 *
 * While a range is pending (rfirst <= rlast) its members are handed out one
 * per call. A '-' that does not follow an octal escape or a range starts a
 * new range running from the character after s->prev up to the character
 * after the '-'; a '-' at the very end of the string is returned literally.
 * After a character written as an octal escape s->prev_was_octal is set, so
 * that a 0 return can be told apart from the end of the set.
 *
 * At the end of the string 0 is returned with s->prev_was_octal cleared.
 * The cursor is left on the terminator, so the function may be called again
 * any number of times and keeps returning 0.
 *
 * NOTE(review): rfirst/rlast are plain char, so ranges that involve bytes
 * above CHAR_MAX depend on the signedness of char - confirm whether such
 * ranges need to be supported. */
static char
get_next_char(struct set_state *s)
{
	char c;
	int nchars;

start:
	/* a range is pending: hand out its next member */
	if(s->rfirst <= s->rlast) {
		c = s->rfirst;
		s->rfirst++;
		return c;
	}

	if(*s->s == '-' && !s->prev_was_octal) {
		s->s++;
		if(!*s->s)
			return '-';
		if(*s->s == '\\' && (nchars = resolve_escape(++(s->s))))
			goto char_is_octal;
		s->rlast = *(s->s)++;
		if(!s->rlast)
			return '\0';
		/* also keeps a '-' right after the range from opening another one */
		s->prev_was_octal = 1;
		s->rfirst = ++(s->prev);
		goto start;
	}

	if(*s->s == '\\' && (nchars = resolve_escape(++(s->s))))
		goto char_is_octal;

	s->prev_was_octal = 0;
	c = *s->s;
	/* never step over the terminator: callers keep asking for characters
	 * after the set is exhausted */
	if(c != '\0')
		s->s++;
	s->prev = c;
	return c;

char_is_octal:
	s->prev_was_octal = 1;
	c = *s->s;
	s->s += nchars; /* skip the digits resolve_escape() consumed */
	return c;
}
/* Return the next wide character of the set described by s, expanding
 * "a-b" ranges and backslash escapes on the way.
 *
 * Members of a pending range (rfirst <= rlast) are handed out one per call.
 * A '-' that does not directly follow a range starts a new one, running
 * from the character after s->prev up to the character after the '-'; a
 * '-' that ends the string is returned literally. Returns 0 once the
 * terminator of the set string is decoded. */
static wchar_t
get_next_wchar(struct wset_state *s)
{
	for(;;) {
		/* a range is pending: hand out its next member */
		if(s->rfirst <= s->rlast) {
			s->prev = s->rfirst++;
			return s->prev;
		}

		/* anything but a range-opening '-' is an ordinary character */
		if(*s->s != '-' || s->prev_was_range)
			break;

		s->s++;
		if(*s->s == '\0')
			return '-';
		if(*s->s == '\\')
			resolve_escape(++(s->s));
		s->s += xmbtowc(&s->rlast, s->s);
		if(s->rlast == 0)
			return '\0';
		s->rfirst = ++(s->prev);
		s->prev_was_range = 1;
	}

	if(*s->s == '\\')
		resolve_escape(++(s->s));
	s->s += xmbtowc(&s->prev, s->s);
	s->prev_was_range = 0;
	return s->prev;
}
/* Fill the global mappings table from the set specifications and tell the
 * caller which kind of mapper matches the way the sets were parsed.
 *
 * set1 is the source set. set2 is the replacement set, or NULL in delete
 * mode, in which case every member of set1 is simply marked with 1. When
 * set1 is longer than set2 the remaining members of set1 map to the last
 * character obtained from set2; with an empty set2 they are left unmapped.
 *
 * If either set contains an octal escape both sets are parsed byte by byte
 * and 0 is returned; otherwise they are parsed as multibyte text and 1 is
 * returned. */
static int
is_mapping_wide(const char *set1, const char *set2)
{
	struct set_state ss1, ss2;
	struct wset_state wss1, wss2;
	wchar_t wc1, wc2;
	wchar_t last_wc2 = 0; /* 0 in mappings means "not mapped" */

	/* get_next_wchar() does not skip octal digits, so an octal escape in
	 * either set forces the byte-wise parser */
	if(has_octal_escapes(set1) || (set2 && has_octal_escapes(set2))) {
		set_state_defaults(&ss1);
		ss1.s = (char *) set1;
		if(set2) {
			set_state_defaults(&ss2);
			ss2.s = (char *) set2;
			/* if the character returned is from an octal triplet, it might be null
			   and still need to continue */
			while((wc1 = (unsigned char) get_next_char(&ss1)) || ss1.prev_was_octal) {
				if(!(wc2 = (unsigned char) get_next_char(&ss2)))
					wc2 = last_wc2;
				mappings[wc1] = wc2;
				last_wc2 = wc2;
			}
		} else {
			while((wc1 = (unsigned char) get_next_char(&ss1)) || ss1.prev_was_octal)
				mappings[wc1] = 1;
		}
		return 0;
	}

	wset_state_defaults(&wss1);
	wss1.s = (char *) set1;
	if(set2) {
		wset_state_defaults(&wss2);
		wss2.s = (char *) set2;
		while((wc1 = get_next_wchar(&wss1))) {
			if(!(wc2 = get_next_wchar(&wss2)))
				wc2 = last_wc2;
			mappings[wc1] = wc2;
			last_wc2 = wc2;
		}
	} else {
		while((wc1 = get_next_wchar(&wss1)))
			mappings[wc1] = 1;
	}
	return 1;
}
/* Delete mode for multibyte input: decode the nbytes bytes at in one
 * character at a time and write, with putwchar(), every character whose
 * "marked in mappings" status differs from what -c (cflag) asks to drop.
 *
 * A NUL byte is treated as a one-byte character, and a byte that cannot be
 * decoded is passed on as a single character with that byte's (unsigned)
 * value, so the loop always makes progress and never indexes mappings with
 * a negative value. Decoding never looks beyond the nbytes given. */
static void
wmap_null(char *in, ssize_t nbytes)
{
	char *s = in;
	wchar_t rune;
	int parsed_bytes;
	size_t avail;

	while(nbytes > 0) {
		/* at most four bytes per character, but never past the input */
		avail = (size_t)nbytes < 4 ? (size_t)nbytes : 4;
		parsed_bytes = mbtowc(&rune, s, avail);
		if(parsed_bytes < 0) {
			(void)mbtowc(NULL, NULL, 0); /* back to the initial shift state */
			rune = (unsigned char)*s;
			parsed_bytes = 1;
		} else if(parsed_bytes == 0) {
			/* mbtowc() reports a NUL byte as 0; it still occupies one byte */
			rune = L'\0';
			parsed_bytes = 1;
		}
		if(((!mappings[rune])&1) ^ cflag)
			putwchar(rune);
		s += parsed_bytes;
		nbytes -= parsed_bytes;
	}
}
static void
maptonull(const wchar_t *mappings, char *in)
wmap_set(char *in, ssize_t nbytes)
{
const char *s;
wchar_t runeleft;
int leftbytes = 0;
char *s;
wchar_t rune;
int parsed_bytes = 0;
s = in;
while(*s) {
leftbytes = xmbtowc(&runeleft, s);
if(!mappings[runeleft])
putwchar(runeleft);
s += leftbytes;
}
}
static void
maptoset(const wchar_t *mappings, char *in)
{
const char *s;
wchar_t runeleft;
int leftbytes = 0;
s = in;
while(*s) {
leftbytes = xmbtowc(&runeleft, s);
if(!mappings[runeleft])
putwchar(runeleft);
while(nbytes) {
parsed_bytes = embtowc(&rune, s);
if(parsed_bytes < 0) {
rune = *s;
parsed_bytes = 1;
}
if(!mappings[rune])
putwchar(rune);
else
putwchar(mappings[runeleft]);
s += leftbytes;
putwchar(mappings[rune]);
nbytes -= parsed_bytes;
s += parsed_bytes;
}
}
/* Delete mode for byte-wise input: walk the nbytes bytes at in and write,
 * with putchar(), every byte whose "unmarked in mappings" status, flipped
 * by cflag, comes out non-zero. */
static void
map_null(char *in, ssize_t nbytes)
{
	int keep;

	while(nbytes != 0) {
		keep = (mappings[(unsigned char)*in] == 0) ^ cflag;
		if(keep)
			putchar(*in);
		in++;
		nbytes--;
	}
}
/* Translate mode for byte-wise input: walk the nbytes bytes at in and
 * write, with putchar(), each byte's entry in mappings, or the byte itself
 * when that entry is 0. */
static void
map_set(char *in, ssize_t nbytes)
{
	wchar_t target;

	while(nbytes != 0) {
		target = mappings[(unsigned char)*in];
		if(target)
			putchar(target);
		else
			putchar(*in);
		in++;
		nbytes--;
	}
}
int
main(int argc, char *argv[])
{
wchar_t *mappings;
char *buf = NULL;
size_t size = 0;
void (*mapfunc)(const wchar_t*, char*);
int dflag = 0;
ssize_t nbytes;
void (*mapfunc)(char*, ssize_t);
setlocale(LC_ALL, "");
mappings = mmap(NULL, 0x110000 * sizeof(wchar_t),
PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0);
if (mappings == MAP_FAILED)
eprintf("mmap:");
dflag = cflag = 0;
ARGBEGIN {
case 'd':
dflag = 1;
break;
case 'c':
cflag = 1;
break;
default:
usage();
} ARGEND;
@ -148,25 +328,29 @@ main(int argc, char *argv[])
if(argc == 0)
usage();
if(dflag || argc == 1) {
if(dflag) {
if(argc != 1)
usage();
parsemapping(argv[0], NULL, mappings);
mapfunc = maptonull;
if(is_mapping_wide(argv[0], NULL))
mapfunc = wmap_null;
else
mapfunc = map_null;
} else if(cflag) {
usage();
} else if(argc == 2) {
if(is_mapping_wide(argv[0], argv[1]))
mapfunc = wmap_set;
else
mapfunc = map_set;
} else {
if(argc != 2)
usage();
parsemapping(argv[0], argv[1], mappings);
mapfunc = maptoset;
usage();
}
while(agetline(&buf, &size, stdin) != -1)
mapfunc(mappings, buf);
while((nbytes = agetline(&buf, &size, stdin)) != -1)
mapfunc(buf, nbytes);
free(buf);
if(ferror(stdin))
eprintf("<stdin>: read error:");
munmap(mappings, 0x110000 * sizeof(wchar_t));
return EXIT_SUCCESS;
}