-
Notifications
You must be signed in to change notification settings - Fork 2
StreamInterface
fawzi edited this page May 25, 2011
·
3 revisions
It can be very useful to be able to apply a regex to a stream. Below is a sketch of an interface that can be used in many different contexts; it is a bit low level, but very flexible.
Important considerations:
- avoid heap allocations
- allow UTF normalization of the stream using just a buffer (again, no heap allocations)
- add one batch at a time, not one char at a time, to allow one to build loops with minimal indirection, even if we support several engines, or decide to implement things using interfaces and virtual methods (and thus some heap allocation)
- there will probably be several Re engines with the same interface
/// Index translator that maps a block-local index to a global one by adding
/// the global offset of the current block, without any other transformation.
struct DirectIndexing(CharType){
    /// global offset that is added to every local index
    size_t posStart;

    /// resets the global offset to 0
    // NOTE(review): a member named init shadows the built-in .init property;
    // newer D compilers may reject this name - confirm before relying on it
    void init(){
        posStart=0;
    }

    /// Translates idxAtt, an index relative to the current block, into a
    /// global index. A non-positive idxAtt may not reach back further than
    /// posStart (checked with an assert). pre and att are not used here.
    size_t resolveLocalIdx(ptrdiff_t idxAtt,CharType[] pre,CharType[]att){
        assert(idxAtt>0||-idxAtt<=posStart);
        size_t globalIdx=posStart+idxAtt;
        return globalIdx;
    }
}
/// A group stored as a pair of indexes into the matched data.
/// The default value (start=size_t.max, end=0) is the invalid group.
struct IdxGroup{
    /// index of the first element of the group
    size_t start=size_t.max;
    /// index one past the last element of the group
    size_t end=0;

    /// extracts the part of fullStr that this group covers
    typeof(T.init[start..end]) opCall(T)(T fullStr){
        auto covered=fullStr[start..end];
        return covered;
    }

    /// true when the group does not describe a valid range (start after end)
    bool invalid(){
        return end<start;
    }
    // add bool cast ?
}
/// Result of a match: a list of groups that can be accessed by index or by name.
///
/// The groups are stored in the inline buffer gBuf while gStart is null,
/// otherwise in the external storage gStart points to. In both cases nGroups
/// is the number of valid groups.
struct Match(Group){
    Group *gStart; /// external group storage, null while gBuf is used
    int nGroups; /// number of valid groups
    int[string] namedGroups; // use something that can be stack allocated??
    Group[16] gBuf; /// inline storage used while gStart is null

    /// returns the valid groups as a slice of the active storage
    Group[] groups(){
        if (gStart is null){
            return gBuf[0..nGroups];
        }
        return gStart[0..nGroups];
    }

    /// returns group i, throws an Exception if i is not in 0..nGroups
    Group opIndex(int i){
        if (i<0 || i>=nGroups) throw new Exception("group index out of bounds",__FILE__,__LINE__);
        if (gStart is null) return gBuf[i];
        return gStart[i];
    }

    /// returns the group registered in namedGroups under the name s,
    /// throws an Exception if there is no such name or its index is invalid
    Group opIndex(string s){
        // use `in` so that an unknown name is reported like a bad index,
        // instead of failing inside the associative array lookup
        auto idx=s in namedGroups;
        if (idx is null) throw new Exception("unknown group name",__FILE__,__LINE__);
        return opIndex(*idx);
    }
}
/// outcome of a single matching attempt
enum MatchKind{
NoMatch,
Match,
// the stream helpers Re.match/Re.searchNext load more data and retry
// for as long as this value is returned
PartialMatch,
}
/// Streaming regular expression matcher (interface sketch).
///
/// The data is handed over one block at a time with addData. When a block is
/// exhausted its unconsumed tail is kept in the small buffer rest (pushAway),
/// so that the caller may reuse or invalidate the block. addEnd marks the end
/// of the stream. match and searchNext drive this protocol from a delegate.
struct Re(CharType=char,IndexTranslator=DirectIndexing!(CharType),Group=IdxGroup){
    /// type of the result filled in by match1 and searchNext1
    alias Match!(Group) MatchType;
    enum Flags: uint{
        None=0,
        AtEnd=1, /// set by addEnd, no data may be added afterwards
    }
    IndexTranslator indexTranslator;
    CharType[32] rest; /// tail of the blocks already pushed away, aligned to the end of the buffer
    const(CharType)[] blockAtt; /// current block (const elements, but the slice itself must be assignable)
    int loopBackSize; /// size of valid data in rest (for loopBacks)
    uint flags;
    ptrdiff_t posAtt; /// use just int??, negative means rest[rest.length+posAtt], positive blockAtt[posAtt]

    /// adds extraData to be matched against
    /// this should be called only when the current block has been fully used
    /// throws an Exception if a block is still pending or addEnd was already called
    void addData(const(CharType)[] extraData){
        if (blockAtt.length!=0){
            throw new Exception("addData called before having fully consumed current data",__FILE__,__LINE__);
        }
        if ((flags&Flags.AtEnd)!=0){
            throw new Exception("addData called after addEnd",__FILE__,__LINE__);
        }
        blockAtt=extraData;
    }

    /// internal method to push away the current block and make place for the next
    /// should be called only when the position is less than rest.length from the end
    ///
    /// Everything from the current position to the end of the block is stored
    /// at the end of rest, and posAtt becomes the (non positive) offset of the
    /// same position relative to the end of rest. A position that is already
    /// in rest (negative posAtt) is supported: the pending part of rest is
    /// moved toward the front and the whole block is appended to it.
    /// throws an Exception if the position is past the end of the block or if
    /// the data to keep does not fit in rest
    // NOTE(review): loopBackSize is not updated here - confirm who maintains it
    void pushAway(){
        // signed arithmetic, posAtt may be negative
        immutable ptrdiff_t blockLen=cast(ptrdiff_t)blockAtt.length;
        immutable ptrdiff_t restLen=cast(ptrdiff_t)rest.length;
        if (posAtt>blockLen){
            throw new Exception("pushAway called with a position past the end of blockAtt",__FILE__,__LINE__);
        }
        if (blockLen-posAtt>restLen){
            throw new Exception("pushAway called too far away from end of blockAtt",__FILE__,__LINE__);
        }
        immutable ptrdiff_t newPos=posAtt-blockLen; // in -restLen..0
        if (posAtt<0){
            // the source and destination ranges may overlap, and the destination
            // starts at or before the source: copy element by element, front to back
            size_t dst=cast(size_t)(restLen+newPos);
            for (size_t src=cast(size_t)(restLen+posAtt);src<rest.length;++src,++dst){
                rest[dst]=rest[src];
            }
            // dst is now rest.length-blockAtt.length
            rest[dst..$]=blockAtt[];
        } else {
            rest[cast(size_t)(restLen+newPos)..$]=blockAtt[cast(size_t)posAtt..$];
        }
        posAtt=newPos;
        blockAtt=null;
    }

    /// marks that the current data is all the available data
    void addEnd(){
        flags|=Flags.AtEnd;
    }

    /// getting the match for the current regular expression (anchored search)
    MatchKind match1(ref MatchType m){
        assert(0,"match1 is engine specific and is not implemented in this sketch");
    }

    /// searches for the next match for the current regular expression
    MatchKind searchNext1(ref MatchType m){
        assert(0,"searchNext1 is engine specific and is not implemented in this sketch");
    }

    /// loads the next chunk with loadNext and adds it with addData,
    /// an empty chunk is taken as the end of the stream (addEnd)
    private void feedNext(const(CharType)[] delegate() loadNext){
        auto newData=loadNext();
        if (newData.length!=0){
            addData(newData);
        } else {
            addEnd();
        }
    }

    /// utility method doing a match on a stream
    /// loadNext might invalidate the previous chunk of data
    /// loadNext signals the end of the stream by returning an empty chunk
    MatchType match(const(CharType)[]delegate() loadNext){
        MatchType res;
        assert(res.nGroups==0); // a fresh result holds no group
        while(match1(res)==MatchKind.PartialMatch){
            // no more data can arrive after the end: stop instead of looping forever
            if ((flags&Flags.AtEnd)!=0) break;
            feedNext(loadNext);
        }
        return res;
    }

    /// utility method doing a search on a stream
    /// loadNext might invalidate the previous chunk of data
    /// loadNext signals the end of the stream by returning an empty chunk
    MatchType searchNext(const(CharType)[]delegate() loadNext){
        MatchType res;
        assert(res.nGroups==0); // a fresh result holds no group
        while(searchNext1(res)==MatchKind.PartialMatch){
            // no more data can arrive after the end: stop instead of looping forever
            if ((flags&Flags.AtEnd)!=0) break;
            feedNext(loadNext);
        }
        return res;
    }
}