/*
* Strsed.c
* $Id: strsed.c,v 1.13 2005/08/27 00:07:28 pguyot Exp $
*
* ed(1)/tr(1)-like search, replace, transliterate. See the
* manpage for details. See the README for copyright information.
*
* Usage:
*
* strsed(string, pattern, 0);
* char *string;
* char *pattern;
* or
* strsed(string, pattern, range);
* char *string;
* char *pattern;
* int range[2];
*
*
* Terry Jones
* terry@distel.pcs.com
* ...!{pyramid,unido}!pcsbst!distel!terry
*
* PCS Computer Systeme GmbH
* Pfaelzer-Wald-Str 36
* 8000 Muenchen 90
* West Germany 49-89-68004288
*
* January 8th, 1990.
*
*/
/*
* strsed.c,v $
*
* Revision 2.4 90/04/19 15:40:38 terry
* Made it possible to use any delimiter. E.g. s/ex/on/ is the same as
* s.ex.on.
* Fixed trailing backslash bugger. Made range always contain something
* after search and replace to indicate if a substitute actually occurred.
*
* Revision 2.3 90/04/17 19:36:12 terry
* Made realloc ok too....
*
* Revision 2.2 90/04/17 19:27:02 terry
* Did things to make malloc() and free() calls more portable.
*
* Revision 2.1 90/04/15 18:06:09 terry
* Added changes suggested by John B. Thiel. Added empty regs and empty
* regexp structs.
*
* Revision 2.0 90/04/09 16:06:19 terry
* Added dollops of #ifdef's to deal with Henry Spencer or
* GNU regex packages. Also added optimisation that saves the
* last compiled pattern. All seems to work fine except the
* register reference inside a regex with the HS stuff.
*
* Revision 1.19 90/04/09 11:57:01 terry
* little things.
*
* Revision 1.17 90/03/08 20:44:32 terry
* Final cleanup.
*
* Revision 1.16 90/03/07 15:46:35 terry
* Changed backslash_eliminate to only malloc on
* REPLACEMENT type. Added ".*" optimisation so that
* the regex functions are never called.
*
* Revision 1.15 90/03/06 22:27:49 terry
* Removed varargs stuff since the 3rd argument is now
* compulsory. Cleaned up. A few comments even.
*
* Revision 1.14 90/03/06 21:50:28 terry
* Touched up memory stuff. Added mem_find(). Changed
* buf_sz and buf_inc to be a reasonable refelection
* of the length of the input.
*
* Revision 1.13 90/03/06 20:22:48 terry
* Major rearrangements. Added mem(), mem_init(), mem_save(),
* mem_free() to handle memory in a vastly improved fashion.
* Calls to malloc are minimised as far as possible.
*
* Revision 1.12 90/03/06 13:23:33 terry
* Made map static.
*
* Revision 1.11 90/01/10 15:51:12 terry
* checked in with -k by terry at 90.01.18.20.03.08.
*
* Revision 1.11 90/01/10 15:51:12 terry
* *** empty log message ***
*
* Revision 1.10 90/01/10 12:48:40 terry
* Fixed handling of perverted character ranges in nextch().
* a-f-c now means a-c.
*
* Revision 1.9 90/01/10 12:03:48 terry
* Pounded on space allocation, added more_space,
* remove free() in build_map, tested tiny buffer sizes etc.
*
* Revision 1.8 90/01/09 18:15:12 terry
* added backslash elimination to str.
* altered backslash_elimantion to take one of three types
* REGEX, NORMAL or REPLACEMENT depending on the
* elimination desired. Changed interpretation of \
* followed by a single digit to be that character if the
* type of elimination is NORMAL. i.e. \4 = ^D.
*
* Revision 1.7 90/01/09 17:05:05 terry
* Frozen version for release to comp.sources.unix
*
* Revision 1.6 90/01/09 16:47:54 terry
* Altered pure searching return values to be -1
*
* Revision 1.5 90/01/09 14:54:34 terry
* *** empty log message ***
*
* Revision 1.4 90/01/09 14:51:04 terry
* removed #include <stdio> silliness.
*
* Revision 1.2 90/01/09 10:48:22 terry
* Fixed handling of } and - metacharacters inside
* transliteration request strings in backslash_eliminate().
*
* Revision 1.1 90/01/08 17:41:35 terry
* Initial revision
*
*
*/
#define HS_REGEX 1
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#ifdef GNU_REGEX
#include "regex.h"
#endif
#ifdef HS_REGEX
#include <sys/types.h>
#include <regex.h>
#endif
#define BYTEWIDTH 8
#define REGEX 0
#define REPLACEMENT 1
#define NORMAL 2
/*
* And this is supposed to make freeing easier. It's a little hard to
* keep track of what can and cannot be freed in what follows, so I
* ignore it and every time a malloc is done for one of the things
* below (and these are the only ones possible) we free if need be and
* then alloc some more if it can't be avoided. No-one (who is going
* to free) needs to call malloc then. And no-one need call free.
* Wonderful in theory...
*/
#define MEM_STR 0
#define MEM_PAT 1
#define MEM_FROM 2
#define MEM_TO 3
#define MEM_NEWSTR 4
#define MEM_MAP 5
#define MEM_MAP_SAVE 6
#define MEM_SLOTS 7
/*
* This calls mem_free(), which free()s all the allocated storage EXCEPT
* for the piece whose address is 'n'. If something goes wrong below
* we call RETURN(0) and if we want to return some address we call RETURN
* with the address to be returned.
*/
#define RETURN(n) \
mem_free(n); \
if (exp_regs != NULL) \
free(exp_regs); \
return (char *)n
static struct {
char *s;
int size;
int used;
} mem_slots[MEM_SLOTS];
#define more_space(need) \
if (need > 0 && space != -1){ \
if (space - (need) < 0){ \
buf_sz += buf_inc + (need) - space; \
if (!(new_str = (char *)realloc(new_str, (unsigned)buf_sz))){ \
RETURN(0); \
} \
mem_slots[MEM_NEWSTR].s = new_str; \
mem_slots[MEM_NEWSTR].size = buf_sz; \
space = buf_inc; \
} \
else{ \
space -= need; \
} \
}
#ifdef GNU_REGEX
#define NO_MATCH -1
#define EMPTY_REGISTER -1
#endif
#ifdef HS_REGEX
#define NO_MATCH 0
#define EMPTY_REGISTER ((regoff_t) 0)
#endif
/* ------------------------------------------------------------------------- **
* Prototypes
* ------------------------------------------------------------------------- */
static char *mem();
static void mem_init();
static void mem_free();
static char *build_map();
static char nextch();
static void mem_save();
static int mem_find();
/* ------------------------------------------------------------------------- **
* strsed
* ------------------------------------------------------------------------- */
char *
strsed(string, pattern, range)
register char *string;
register char *pattern;
int *range;
{
#ifdef GNU_REGEX
extern char *re_compile_pattern();
extern int re_search();
static struct re_pattern_buffer re_comp_buf;
struct re_registers regs;
static struct re_registers empty_regs;
#endif
#ifdef HS_REGEX
static regmatch_t *exp_regs = NULL;
static regex_t exp;
#endif
char *backslash_eliminate();
char *from;
char *new_str;
char *pat;
char *str;
char *tmp;
char *to;
static char map[1 << BYTEWIDTH];
int buf_sz;
int buf_inc;
int global = 0;
int match;
int new_pos = 0;
int search_only = 0;
int seenbs = 0;
int space;
int match_all = 0;
register int str_len;
static int first_time = 1;
static char *last_exp = (char *)0;
int repeat;
char delimiter;
if (!string || !pattern){
RETURN(0);
}
/*
* If this is the first time we've been called, clear the memory slots.
*/
if (first_time){
#ifdef GNU_REGEX
register int i;
#endif
mem_init();
#ifdef GNU_REGEX
/* Zero the fake regs that we use if the regex is ".*" */
for (i = 0; i < RE_NREGS; i++){
empty_regs.start[i] = empty_regs.end[i] = EMPTY_REGISTER;
}
#endif
#ifdef HS_REGEX
/* We use first_time again if we are GNU_REGEX, and reset it later. */
first_time = 0;
#endif
}
/*
* Take our own copies of the string and pattern since we promised
* in the man page not to hurt the originals.
*/
str = mem(MEM_STR, strlen(string) + 1);
str[0] = '\0';
strcat(str, string);
pat = mem(MEM_PAT, strlen(pattern) + 1);
pat[0] = '\0';
strcat(pat, pattern);
/*
* If escape sequences are not already removed elsewhere, remove
* them from the string. If you don't know what you're doing here
* or are in any doubt, don't define ESCAPED_STRING.
*/
#ifndef ESCAPED_STRING
if (!(str = backslash_eliminate(str, NORMAL, MEM_STR))){
RETURN(0);
}
#endif
str_len = strlen(str);
/*
* Set up the size of our buffer (in which we build the
* newstring, and the size by which we increment it when
* (and if) the need arises. There shouldn't be too much
* growth in the average case. Of course some people will
* go and do things like
*
* strsed(string, "s/.*$/\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0")
*
* and they will be somewhat penalised. Oh well.
*
*/
buf_sz = str_len < 8 ? 16 : str_len << 1;
buf_inc = buf_sz;
/*
* Get the action.
* s = substitue and g = global.
* anything else is invalid.
*
* If one of these is present, the next char is the delimiter.
* Otherwise the character is taken as the delimiter itself.
* This is more flexible, for example the following are all
* legal:
*
* s/pinto/bean/
* /pinto/bean
* /pinto/
* g/pinto/bean/
*
*/
switch (*pat){
case 'g':{
global = 1;
pat++;
break;
}
case 's':{
pat++;
break;
}
default:{
break;
}
}
if (!*pat){
RETURN(0);
}
delimiter = *pat++;
/*
* Now split 'pat' into its two components. These are delimited (or
* should be) by (unquoted) 'delimiter'. The first we point to with 'from'
* and the second with 'to'.
*
* Someone should write a function to make this sort of thing trivial...
*
*/
from = to = pat;
while (*to){
if (seenbs){
seenbs = 0;
}
else{
if (*to == '\\'){
seenbs = 1;
}
else if (*to == delimiter){
break;
}
}
to++;
}
if (!*to){
RETURN(0);
}
*to++ = '\0';
if (*to){
tmp = to + strlen(to) - 1;
/*
* Make sure that the last character is the delimiter,
* and wasn't preceded by \.
*
*/
if (*tmp != delimiter || *(tmp - 1) == '\\'){
RETURN(0);
}
*tmp = '\0';
}
else{
/*
* Search only.
* It doesn't make sense to say
*
* strsed(string, "g/abc/", range)
*
* because we are only searching and returning the
* matched indexes. So turn off global (in case it's on)
* so that we will return just the first instance.
*
* If no range has been given either, then there's no
* point in going on.
*
*/
if (!range){
RETURN(0);
}
global = 0;
search_only = 1;
}
/*
* Eliminate backslashes and character ranges etc.
* Check that 'to' is a non-empty string before bothering
* to try and eliminate things.
*
*/
if (!(from = backslash_eliminate(from, REGEX, MEM_FROM))){
RETURN(0);
}
if (to && !(to = backslash_eliminate(to, REPLACEMENT, MEM_TO))){
RETURN(0);
}
/*
* If the first char of 'to' is '\0' then we are deleting or
* searching only. We don't have to worry about space since
* the transformed string will be less than or equal in length
* to the original. We just overwrite.
* We set space = -1 so that later on we can avoid worrying
* about overflow etc.
*
* Otherwise, we are doing a substitution. Here we have to
* worry about space because the replacement may be larger
* than the original. malloc some room and if we overflow it
* later we will realloc. Slows things down if the new string
* turns out to be too much bigger. Oh well.
*
*/
if (*to){
if (!(new_str = mem(MEM_NEWSTR, buf_sz + 1))){
RETURN(0);
}
space = buf_sz;
}
else{
new_str = str;
space = -1;
}
/*
* Check to see if the regexp is the same as last time.
* If so, we can save ourselves a call to regexec (or whatever
* function your regex package uses).
*
*/
if (last_exp){
if (!strcmp(from, last_exp)){
repeat = 1;
}
else{
free(last_exp);
last_exp = strdup(from);
repeat = 0;
}
}
else {
last_exp = strdup(from);
repeat = 0;
}
/*
* Initialise the range integers to -1, since they may be checked after we
* return, even if we are not just searching.
*/
if (range){
range[0] = range[1] = -1;
}
/*
* Check for the special case where the regex is ".*" since
* then we can save a call to compile and to match, since we
* know what will happen. We can just fake it.
*
*/
if (from[0] == '.' && from[1] == '*' && from[2] == '\0'){
match_all = 1;
/*
* For safety's sake, clear out the register values.
* There might be a register reference in the replacement.
* There will be nothing in the registers (since the search
* pattern was ".*"). Since we aren't calling the regex
* stuff we can't rely on it to set these to -1 (or 0 - as the
* case may be).
*/
#ifdef GNU_REGEX
regs = empty_regs;
#endif
}
#ifdef GNU_REGEX
/*
* Do the first_time check for GNU. Notice the "else" here. We don't
* want to do this if the regex is ".*", even if it is our first time.
*/
else{
if (first_time){
if (!(re_comp_buf.buffer = (char *)malloc((unsigned)200))){
RETURN(0);
}
re_comp_buf.allocated = 200;
if (!(re_comp_buf.fastmap = (char *)malloc((unsigned)1 << BYTEWIDTH))){
RETURN(0);
}
first_time = 0;
}
if (!repeat){
re_comp_buf.translate = 0;
re_comp_buf.used = 0;
}
}
#endif
/*
* If we are not optimising a ".*" or repeating the regex we had last time,
* compile the regular expression.
*/
if (!match_all && !repeat){
#ifdef GNU_REGEX
if (re_compile_pattern(from, strlen(from), &re_comp_buf)){
RETURN(0);
}
#endif
#ifdef HS_REGEX
if (regcomp(&exp, from, 0) != 0){
RETURN(0);
}
#endif
}
/*
* Now get on with the matching/replacing etc.
*/
do {
if (match_all){
/* Fake a match instead of calling re_search() or regexec(). */
match = 1;
#ifdef GNU_REGEX
regs.start[0] = 0;
regs.end[0] = str_len;
#endif
}
else{
#ifdef GNU_REGEX
match = re_search(&re_comp_buf, str, str_len, 0, str_len, ®s);
#endif
#ifdef HS_REGEX
/* XXX Not even trying to use custom memory routines */
if (!(exp_regs = calloc(str_len, sizeof(regmatch_t)))) {
return 0;
}
match = regexec(&exp, str, str_len, exp_regs, 0) ? NO_MATCH : 1;
#endif
}
if (search_only){
/*
* Show what happened and return.
*/
#ifdef GNU_REGEX
range[0] = match == NO_MATCH ? -1 : regs.start[0];
range[1] = match == NO_MATCH ? -1 : regs.end[0];
#endif
#ifdef HS_REGEX
range[0] = match == NO_MATCH ? -1 : exp_regs[0].rm_so;
range[1] = match == NO_MATCH ? -1 : exp_regs[0].rm_eo;
#endif
RETURN(str);
}
if (match != NO_MATCH){
register int need;
/* Set up the range so it can be used later if the caller wants it. */
if (range){
#ifdef GNU_REGEX
range[0] = regs.start[0];
range[1] = regs.end[0];
#endif
#ifdef HS_REGEX
range[0] = exp_regs[0].rm_so;
range[1] = exp_regs[0].rm_eo;
#endif
}
/*
* Copy that portion that was not matched. It will
* be unchanged in the output string.
*
*/
#ifdef GNU_REGEX
need = regs.start[0];
#endif
#ifdef HS_REGEX
need = exp_regs[0].rm_so;
#endif
if (need > 0){
more_space(need);
strncpy(new_str + new_pos, str, need);
new_pos += need;
}
/*
* Put in the replacement text (if any).
* We substitute the contents of 'to', watching for register
* references.
*/
tmp = to;
while (*tmp){
if (*tmp == '\\' && isdigit(*(tmp + 1))){
/* A register reference. */
register int reg = *(tmp + 1) - '0';
int translit = 0;
#ifdef GNU_REGEX
need = regs.end[reg] - regs.start[reg];
#endif
#ifdef HS_REGEX
need = exp_regs[reg].rm_eo - exp_regs[reg].rm_so;
#endif
/*
* Check for a transliteration request.
*
*/
if (*(tmp + 2) == '{'){
/* A transliteration table. Build the map. */
if (!(tmp = build_map(tmp + 2, map))){
RETURN(0);
}
translit = 1;
}
else{
tmp += 2;
translit = 0;
}
more_space(need);
/*
* Copy in the register contents (if it matched), transliterating if need be.
*
*/
#ifdef GNU_REGEX
if (regs.start[reg] != EMPTY_REGISTER){
register int i;
for (i = regs.start[reg]; i < regs.end[reg]; i++){
new_str[new_pos++] = translit ? map[str[i]] : str[i];
}
}
#endif
#ifdef HS_REGEX
if (exp_regs[0].rm_so != EMPTY_REGISTER){
register regoff_t s;
for (s = exp_regs[0].rm_so; s < exp_regs[0].rm_eo; s++){
new_str[new_pos++] = translit ? map[s] : s;
}
}
#endif
}
else{
/* A plain character, put it in. */
more_space(1);
new_str[new_pos++] = *tmp++;
}
}
/*
* Move forward over the matched text.
*
*/
#ifdef GNU_REGEX
str += regs.end[0];
str_len -= regs.end[0];
#endif
#ifdef HS_REGEX
str += exp_regs[0].rm_eo;
str_len -= (int)(exp_regs[0].rm_eo - exp_regs[0].rm_so);
#endif
}
} while (global && match != NO_MATCH && *str);
/*
* Copy the final portion of the string. This is the section that
* was not matched (and hence which remains unchanged) by the last
* match. Then we head off home.
*
*/
more_space(str_len);
(void) strcpy(new_str + new_pos, str);
RETURN(new_str);
}
#define DIGIT(x) (isdigit(x) ? (x) - '0' : islower(x) ? (x) + 10 - 'a' : (x) + 10 - 'A')
char *
backslash_eliminate(str, type, who)
char *str;
int type;
int who;
{
/*
* Remove backslashes from the strings. Turn \040 etc. into a single
* character (we allow eight bit values). Currently NUL is not
* allowed.
*
* Turn "\n" and "\t" into '\n' and '\t' characters. Etc.
*
* The string may grow slightly here. Under normal circumstances
* it will stay the same length or get shorter. It is only in the
* case where we have to turn {a-z}{A-Z} into \0{a-z}{A-Z} that
* we add two chars. This only happens when we are doing a REPLACEMENT.
* So we can't overwrite str, and we have to
* malloc. Sad, but the only ways I could find around it (at this
* late stage) were really gross. I allowed an extra
* 100 bytes which should cover most idiotic behaviour.
* I count the extra space and exit nicely if they do do something
* extremely silly.
*
* 'i' is an index into new_str.
*
* 'type' tells us how to interpret escaped characters.
*
* type = REGEX
* if the pattern is a regular expression. If it is then
* we leave escaped things alone (except for \n and \t and
* friends).
*
* type = REPLACEMENT
* if this is a replacement pattern. In this case we change
* \( and \) to ( and ), but leave \1 etc alone as they are
* register references. - becomes a metacharacter between
* { and }.
*
* type = NORMAL
* We do \n and \t elimination, as well as \040 etc, plus
* all other characters that we find quoted we unquote.
* type = NORMAL when we do a backslash elimination on the
* string argument to strsed.
*
* who tells us where to tell mem where to stick the new string.
*
* \{m,n\} syntax (see ed(1)) is not supported.
*
*/
char *new_str;
int extra = 100;
int seenlb = 0;
register int i = 0;
register int seenbs = 0;
int first_half = 0;
if (type == REPLACEMENT){
if (!(new_str = mem(who, strlen(str) + 1 + extra))){
return 0;
}
}
else{
new_str = str;
}
while (*str){
if (seenbs){
seenbs = 0;
switch (*str){
case '\\':{
new_str[i++] = '\\';
str++;
break;
}
case '-':{
if (seenlb){
/* Keep it quoted. */
new_str[i++] = '\\';
}
new_str[i++] = '-';
str++;
break;
}
case '}':{
if (seenlb){
/* Keep it quoted. */
new_str[i++] = '\\';
}
new_str[i++] = '}';
str++;
break;
}
case 'n':{
new_str[i++] = '\n';
str++;
break;
}
case 't':{
new_str[i++] = '\t';
str++;
break;
}
case 's':{
new_str[i++] = ' ';
str++;
break;
}
case 'r':{
new_str[i++] = '\r';
str++;
break;
}
case 'f':{
new_str[i++] = '\f';
str++;
break;
}
case 'b':{
new_str[i++] = '\b';
str++;
break;
}
case 'v':{
new_str[i++] = '\13';
str++;
break;
}
case 'z':{
str++;
break;
}
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':{
char val;
/*
* Three digit octal constant.
*
*/
if (*str >= '0' && *str <= '3' &&
*(str + 1) >= '0' && *(str + 1) <= '7' &&
*(str + 2) >= '0' && *(str + 2) <= '7'){
val = (DIGIT(*str) << 6) +
(DIGIT(*(str + 1)) << 3) +
DIGIT(*(str + 2));
if (!val){
/*
* NUL is not allowed.
*/
return 0;
}
new_str[i++] = val;
str += 3;
break;
}
/*
* One or two digit hex constant.
* If two are there they will both be taken.
* Use \z to split them up if this is not wanted.
*
*/
if (*str == '0' && (*(str + 1) == 'x' || *(str + 1) == 'X') && isxdigit(*(str + 2))){
val = DIGIT(*(str + 2));
if (isxdigit(*(str + 3))){
val = (val << 4) + DIGIT(*(str + 3));
str += 4;
}
else{
str += 3;
}
if (!val){
return 0;
}
new_str[i++] = val;
break;
}
/*
* Two or three decimal digits.
* (One decimal digit is taken as either a register reference
* or as a decimal digit if NORMAL is true below.)
*
*/
if (isdigit(*(str + 1))){
val = DIGIT(*str) * 10 + DIGIT(*(str + 1));
if (isdigit(*(str + 2))){
val = 10 * val + DIGIT(*(str + 2));
str += 3;
}
else{
str += 2;
}
if (!val){
return 0;
}
new_str[i++] = val;
break;
}
/*
* A register reference or else a single decimal digit if this
* is a normal string..
*
* Emit \4 (etc) if we are not NORMAL (unless the digit is a 0
* and we are processing an r.e. This is because \0 makes no
* sense in an r.e., only in a replacement. If we do have \0
* and it is an r.e. we return.)
*
*/
if (*str == '0' && type == REGEX){
return 0;
}
if (type == NORMAL){
if (!(val = DIGIT(*str))){
return 0;
}
new_str[i++] = val;
str++;
}
else{
new_str[i++] = '\\';
new_str[i++] = *str++;
}
break;
}
default:{
if (type == REGEX){
new_str[i++] = '\\';
}
new_str[i++] = *str++;
break;
}
}
}
else{
if (*str == '\\'){
seenbs = 1;
str++;
}
else if (type == REPLACEMENT && *str == '}'){
if (*(str + 1) == '{' && first_half){
new_str[i++] = *str++;
new_str[i++] = *str++;
first_half = 0;
}
else{
seenlb = 0;
new_str[i++] = *str++;
}
}
else if (type == REPLACEMENT && !seenlb && *str == '{'){
/*
* Within { and }, \- should be left as such. So we can differentiate
* between s/fred/\-/ and s/fred/{\-a-z}{+A-Z}
*
* We stick in a "\0" here in the case that \X has not just been
* seen. (X = 0..9) Which is to say, {a-z}{A-Z} defaults to
* \0{a-z}{A-Z}
*
*/
seenlb = 1;
first_half = 1;
if (i < 2 || new_str[i - 2] != '\\' || !(new_str[i - 1] >= '0' && new_str[i - 1] <= '9')){
if ((extra -= 2) < 0){
/* ran out of extra room. */
return 0;
}
new_str[i++] = '\\';
new_str[i++] = '0';
}
new_str[i++] = *str++;
}
else{
/*
* A normal char.
*
*/
new_str[i++] = *str++;
}
}
}
if (seenbs){
/*
* The final character was a '\'. Put it in as a single backslash.
*
*/
new_str[i++] = '\\';
}
new_str[i] = '\0';
return new_str;
}
static char *
build_map(s, map)
char *s;
char *map;
{
/*
* Produce a mapping table for the given transliteration.
* We are passed something that looks like "{a-z}{A-Z}"
* Look out for \ chars, these are used to quote } and -.
*
* Return a pointer to the char after the closing }.
* We cannot clobber s.
*
* The building of maps is somewhat optimised.
* If the string is the same as the last one we were
* called with then we don't do anything. It would be better
* to remember all the transliterations we have seen, in
* order (because in a global substitution we will
* apply them in the same order repeatedly) and then we
* could do the minimum amount of building. This is a
* compromise because it is a fairly safe bet that there will
* not be more than one transliteration done.
*
*/
char *in;
char *out;
char *str;
char *tmp;
char c;
int i = 0;
int range_count = 0;
int seenbs = 0;
static char *last = 0;
static int last_len;
out = 0;
if (!s){
return 0;
}
if (last && !strncmp(s, last, last_len)){
/* Re-use the map. */
return s + last_len;
}
else{
/*
* Make a copy of s in both 'last' and 'str'
*/
int len = strlen(s) + 1;
if (!(str = mem(MEM_MAP, len)) || !(last = mem(MEM_MAP_SAVE, len))){
return 0;
}
str[0] = last[0] = '\0';
strcat(str, s);
strcat(last, s);
}
tmp = str + 1;
in = str;
while (*tmp){
if (seenbs){
if (*tmp == '-'){
/*
* Keep the \ before a - since this is the range
* separating metacharacter. We don't keep } quoted,
* we just put it in. Then it is passed as a normal
* char (no longer a metachar) to nextch().
*
*/
str[i++] = '\\';
}
str[i++] = *tmp++;
seenbs = 0;
}
else{
if (*tmp == '\\'){
seenbs = 1;
tmp++;
}
else if (*tmp == '}'){
if (!range_count){
/* seen first range. */
range_count = 1;
str[i++] = '\0';
tmp++;
while (*tmp == ' ' || *tmp == '\t'){
tmp++;
}
if (*tmp != '{'){
return 0;
}
out = str + i;
tmp++;
}
else{
/* seen both ranges. */
str[i++] = '\0';
tmp++;
range_count = 2;
break;
}
}
else{
/* A plain defenceless character. */
str[i++] = *tmp++;
}
}
}
if (range_count != 2){
return 0;
}
last_len = tmp - str;
/*
* Now 'out' and 'in' both point to character ranges.
* These will look something like "A-Z" but may be
* more complicated and have {} and - in them elsewhere.
*
*/
for (i = 0; i < 1 << BYTEWIDTH; i++){
map[i] = i;
}
/*
* Ready the range expanding function.
*
*/
(void) nextch(in, 0);
(void) nextch(out, 1);
/*
* For each char in 'in', assign it a value in
* 'map' corresponding to the next char in 'out'.
*
*/
while ((c = nextch((char *)0, 0))){
map[(int) c] = nextch((char *)0, 1);
}
return tmp;
}
static char
nextch(str, who)
char *str;
int who;
{
/*
* Given a range like {a-z0237-9}
* return successive characters from the range on
* successive calls. The first call (when str != 0)
* sets things up.
*
* We must handle strange things like
* {a-b-c-z} = {a-z}
* and {z-l-a} = {z-a}
* and {f-f-f-f-h} = {f-h}
* and {a-z-f-h-y-d-b} = {a-b}
*
* and so on.
*
* This function will remember two strings and will return
* the next charcter in the range specified by 'who'. This
* makes the building of the transliteration table above
* a trivial loop.
*
* I can't be bothered to comment this as much as it
* deserves right now... 8-)
*
*/
static char *what[2] = {0, 0};
static char last[2] = {0, 0};
static int increment[2];
static int pos[2];
if (who < 0 || who > 1){
return 0;
}
if (str){
/* Set up for this string. */
what[who] = str;
pos[who] = 0;
return 1;
}
else if (!what[who]){
return 0;
}
if (!pos[who] && what[who][0] == '-'){
return 0;
}
switch (what[who][pos[who]]){
case '-':{
/* we're in mid-range. */
last[who] += increment[who];
if (what[who][pos[who] + 1] == last[who]){
pos[who] += 2;
}
return last[who];
}
case '\0':{
/*
* We've finished. Keep on returning the
* last thing you saw if who = 1.
*/
if (who){
return last[1];
}
return 0;
}
/* FALLTHROUGH */
case '\\':{
pos[who]++;
}
default:{
last[who] = what[who][pos[who]++];
/*
* If we have reached a '-' then this is the start of a
* range. Keep on moving forward until we see a sensible
* end of range character. Then set up increment so that
* we do the right thing next time round. We leave pos
* pointing at the '-' sign.
*
*/
while (what[who][pos[who]] == '-'){
int inc = 1;
if (what[who][pos[who] + inc] == '\\'){
inc++;
}
if (!what[who][pos[who] + inc]){
return 0;
}
if (what[who][pos[who] + inc + 1] == '-'){
pos[who] += inc + 1;
continue;
}
increment[who] = what[who][pos[who] + inc] - last[who];
if (!increment[who]){
pos[who] += 2;
continue;
}
if (increment[who] > 0){
increment[who] = 1;
break;
}
else if (increment[who] < 0){
increment[who] = -1;
break;
}
}
return last[who];
}
}
}
static char *
mem(who, size)
int who;
int size;
{
/*
* Get 'size' bytes of memeory one way or another.
*
* The 'mem_slots' array holds currently allocated hunks.
* If we can use one that's already in use then do so, otherwise
* try and find a hunk not in use somewhere else in the table.
* As a last resort call malloc. All a bit specialised and
* not too clear. Seems to works fine though.
*/
if (who < 0 || who >= MEM_SLOTS){
return 0;
}
if (mem_slots[who].used){
/*
* There is already something here. Either move/free it or
* return it if it is already big enough to hold this request.
*/
if (mem_slots[who].size >= size){
/* It is already big enough. */
return mem_slots[who].s;
}
else{
mem_save(who);
}
}
else{
/*
* The slot was not in use. Check to see if there is space
* allocated here already that we can use. If there is and
* we can, use it, if there is and it's not big enough try to
* save it. if there isn't then try to find it in another free slot,
* otherwise don't worry, the malloc below will get us some.
*/
if (mem_slots[who].s && mem_slots[who].size >= size){
/* We'll take it. */
mem_slots[who].used = 1;
return mem_slots[who].s;
}
if (mem_slots[who].s){
mem_save(who);
}
else{
int x = mem_find(size);
if (x != -1){
mem_slots[who].s = mem_slots[x].s;
mem_slots[who].size = mem_slots[x].size;
mem_slots[who].used = 1;
mem_slots[x].s = (char *)0;
return mem_slots[who].s;
}
}
}
/*
* Have to use malloc 8-(
*/
if (!(mem_slots[who].s = (char *)malloc((unsigned)size))){
return 0;
}
mem_slots[who].size = size;
mem_slots[who].used = 1;
return mem_slots[who].s;
}
static int
mem_find(size)
int size;
{
/*
* See if we can find an unused but allocated slot with 'size'
* (or more) space available. Return the index, or -1 if not.
*/
register int i;
for (i = 0; i < MEM_SLOTS; i++){
if (!mem_slots[i].used && mem_slots[i].s && mem_slots[i].size >= size){
return i;
}
}
return -1;
}
static void
mem_save(x)
int x;
{
/*
* There is some memory in mem_slots[x] and we try to save it rather
* than free it. In order we try to
*
* 1) put it in an unused slot that has no allocation.
* 2) put it in an unused slot that has an allocation smaller than x's
* 3) free it since there are no free slots and all the full ones are bigger.
*
*/
register int i;
register int saved = 0;
/*
* First we try to find somewhere unused and with no present allocation.
*/
for (i = 0; i < MEM_SLOTS; i++){
if (!mem_slots[i].used && !mem_slots[i].s){
saved = 1;
mem_slots[i].s = mem_slots[x].s;
mem_slots[i].size = mem_slots[x].size;
mem_slots[i].used = 0;
break;
}
}
/*
* No luck yet. Try for a place that is not being used but which has
* space allocated, and which is smaller than us (and all other such spots).
* Pick on the smallest, yeah.
*/
if (!saved){
register int small = -1;
register int small_val = 32767; /* Be nice to 16 bit'ers. Non-crucial if it's too low. */
for (i = 0; i < MEM_SLOTS; i++){
if (!mem_slots[i].used && mem_slots[i].size < mem_slots[x].size && mem_slots[i].size < small_val){
small_val = mem_slots[i].size;
small = i;
}
}
if (small != -1){
saved = 1;
/* We got one, now clobber it... */
free(mem_slots[small].s);
/* and move on in. */
mem_slots[small].s = mem_slots[x].s;
mem_slots[small].size = mem_slots[x].size;
mem_slots[small].used = 0;
}
}
if (!saved){
/* Have to toss it away. */
free(mem_slots[x].s);
}
}
static void
mem_init()
{
/*
* Clear all the memory slots.
*/
register int i;
for (i = 0; i < MEM_SLOTS; i++){
mem_slots[i].s = (char *)0;
mem_slots[i].used = 0;
}
}
static void
mem_free(except)
char *except;
{
/*
* "Clear out" all the memory slots. Actually we do no freeing since
* we may well be called again. We just mark the slots as unused. Next
* time round they might be useful - the addresses and sizes are still there.
*
* For the slot (if any) whose address is 'except', we actually set the
* address to 0. This is done because we are called ONLY from the macro
* RETURN() in strsed() and we intend to return the value in 'except'.
* Once this is done, strsed should (in theory) have no knowledge at all
* of the address it passed back last time. That way we won't clobber it
* and cause all sorts of nasty problems.
*/
register int i;
for (i = 0; i < MEM_SLOTS; i++){
mem_slots[i].used = 0;
if (mem_slots[i].s == except){
mem_slots[i].s = (char *)0;
mem_slots[i].size = 0;
}
}
}