Skip to content

StreamInterface

fawzi edited this page May 25, 2011 · 3 revisions

Being able to apply a regex to a stream can be very useful. This page sketches an interface that can be used in many different contexts; it is a bit low level, but very flexible.

Important considerations:

  • avoid heap allocations
  • allow one to make a UTF normalization on the stream using just a buffer (again no heap allocations)
  • add data one batch at a time, not one char at a time, so that loops can be built with minimal indirection even if we support several engines or decide to implement things using interfaces and virtual methods (and thus accept some heap allocation)
  • there will probably be several Re engines with the same interface
/// Index translator that maps an index local to the current data directly
/// onto a global one by adding the offset posStart.
/// NOTE(review): the member named `init` shadows D's built-in `.init`
/// property for this type -- confirm this is intended.
struct DirectIndexing(CharType){
    /// offset added to every local index
    size_t posStart;

    /// resets the offset to 0
    void init(){
        posStart=0;
    }

    /// Translates a local index into a global one.
    /// Params:
    ///   idxAtt = local index; may be negative as long as -idxAtt<=posStart
    ///   pre    = not used by this translator
    ///   att    = not used by this translator
    /// Returns: posStart+idxAtt
    /// The slices are taken as const so that mutable, const and immutable
    /// data (e.g. string literals) can all be passed.
    size_t resolveLocalIdx(ptrdiff_t idxAtt,const(CharType)[] pre,const(CharType)[] att){
        // explicit cast: the comparison is only meaningful for negative idxAtt
        assert(idxAtt>=0 || cast(size_t)(-idxAtt)<=posStart,"negative local index exceeds posStart");
        return posStart+idxAtt;
    }
}

/// A group stored as a pair of indices delimiting the slice start..end.
/// The default value (start=size_t.max, end=0) is reported as invalid.
struct IdxGroup{
    size_t start=size_t.max;
    size_t end=0;

    /// extracts the text of this group by slicing fullStr with start..end
    auto opCall(T)(T fullStr){
        return fullStr[start..end];
    }

    /// true when the indices are inverted (end lies before start)
    bool invalid(){
        return end<start;
    }
    // add bool cast ?
}

/// Result of a match: nGroups groups, addressable by position or by name.
/// The groups are stored in the inline buffer gBuf unless gStart points to
/// other storage.
struct Match(Group){
    Group *gStart; /// group storage to use instead of gBuf; null means gBuf is used
    int nGroups; /// number of valid groups
    int[string] namedGroups; // use something that can be stack allocated??
    Group[16] gBuf; /// inline storage used while gStart is null

    /// returns the nGroups valid groups as a slice of the storage in use
    Group[] groups(){
        if (gStart is null){
            return gBuf[0..nGroups];
        }
        return gStart[0..nGroups];
    }

    /// returns the i-th group
    /// Throws: Exception when i is negative or not smaller than nGroups
    Group opIndex(int i){
        // fix: the special token is __LINE__ (the original __LINE was an undefined identifier)
        if (i<0 || i>=nGroups) throw new Exception("group index out of bounds",__FILE__,__LINE__);
        if (gStart is null) return gBuf[i];
        return gStart[i];
    }

    /// returns the group whose index is stored under the name s in namedGroups
    Group opIndex(string s){
        return opIndex(namedGroups[s]);
    }
}

/// outcome reported by a single matching step (match1/searchNext1)
enum MatchKind{
    /// no match
    NoMatch,
    /// a match
    Match,
    /// undecided with the data seen so far: the utility loops react to this
    /// value by loading more data (or marking the end) and trying again
    PartialMatch,
}

/// Sketch of a regular expression engine that is fed its input one block at
/// a time (addData), is told when the input is finished (addEnd), and keeps a
/// small fixed buffer (rest) so that no heap allocation is needed to carry
/// data from one block to the next.
/// match1 and searchNext1 are the engine specific steps and are left
/// unimplemented in this sketch.
struct Re(CharType=char,IndexTranslator=DirectIndexing!(CharType),Group=IdxGroup){
    enum Flags: uint{
        None=0,
        AtEnd=1, /// set by addEnd: the current data is all the available data
    }
    /// type used to report matches
    alias Match!(Group) MatchType;

    IndexTranslator indexTranslator;
    CharType[32] rest; /// tail of the block that was pushed away, stored right aligned
    const(CharType)[] blockAtt; /// current block (tail-const slice, so that it can be reassigned)
    int loopBackSize; /// size of valid data in rest (for loopBacks)
    uint flags;
    ptrdiff_t posAtt; /// use just int??, negative means rest[rest.length+posAtt], positive blockAtt[posAtt]
    
    /// adds extraData to be matched against
    /// this should be called only when the current block has been fully used
    /// Throws: Exception if the current block is not empty or addEnd was already called
    void addData(const(CharType)[] extraData){
        if (blockAtt.length!=0){
            throw new Exception("addData called before having fully consumed current data",__FILE__,__LINE__);
        }
        if ((flags&Flags.AtEnd)!=0){
            throw new Exception("addData called after addEnd",__FILE__,__LINE__);
        }
        blockAtt=extraData;
    }
    
    /// internal method to push away the current block and make place for the next:
    /// the data from posAtt to the end of blockAtt is copied to the end of rest,
    /// posAtt becomes the (non positive) offset of that data from the end of rest,
    /// and blockAtt is cleared
    /// should be called only when the position is less than rest.length from the end
    /// Throws: Exception if posAtt is not inside blockAtt, or if more than
    /// rest.length elements remain after posAtt
    void pushAway(){
        // guard against positions that would make the slices below invalid
        if (posAtt<0 || cast(size_t)posAtt>blockAtt.length){
            throw new Exception("pushAway called with a position outside of blockAtt",__FILE__,__LINE__);
        }
        size_t tailLen=blockAtt.length-cast(size_t)posAtt;
        if (tailLen>rest.length){
            throw new Exception("pushAway called too far away from end of blockAtt",__FILE__,__LINE__);
        }
        rest[$-tailLen..$]=blockAtt[cast(size_t)posAtt..$];
        posAtt=-cast(ptrdiff_t)tailLen;
        blockAtt=null;
    }
    
    /// marks that the current data is all the available data
    void addEnd(){
        flags|=Flags.AtEnd;
    }
    
    /// getting the match for the current regular expression (anchored search)
    /// engine specific, not implemented in this sketch
    MatchKind match1(ref MatchType m){
        assert(0,"match1 is not implemented in this sketch");
    }
    
    /// searches for the next match for the current regular expression
    /// engine specific, not implemented in this sketch
    MatchKind searchNext1(ref MatchType m){
        assert(0,"searchNext1 is not implemented in this sketch");
    }
    
    /// feeds the next chunk returned by loadNext to the engine;
    /// an empty chunk marks the end of the data
    private void loadMore(const(CharType)[] delegate() loadNext){
        auto newData=loadNext();
        if (newData.length!=0){
            addData(newData);
        } else {
            addEnd();
        }
    }
    
    /// utility method doing a match on a stream
    /// loadNext might invalidate the previous chunk of data, an empty chunk
    /// means that there is no more data
    /// NOTE(review): assumes that match1 pushes the current block away before
    /// returning PartialMatch (otherwise addData throws) and that it does not
    /// return PartialMatch once AtEnd is set (otherwise this loop does not
    /// terminate) -- confirm against the engine implementation
    MatchType match(const(CharType)[]delegate() loadNext){
        MatchType res;
        assert(res.nGroups==0); // a fresh result holds no groups
        while(match1(res)==MatchKind.PartialMatch){
            loadMore(loadNext);
        }
        return res;
    }

    /// utility method doing a search on a stream
    /// loadNext might invalidate the previous chunk of data, an empty chunk
    /// means that there is no more data
    /// NOTE(review): same assumptions on searchNext1 as match has on match1
    MatchType searchNext(const(CharType)[]delegate() loadNext){
        MatchType res;
        assert(res.nGroups==0); // a fresh result holds no groups
        while(searchNext1(res)==MatchKind.PartialMatch){
            loadMore(loadNext);
        }
        return res;
    }
    
}

Clone this wiki locally