Google Summer of Code(2021)
R project for statistical computing

Google Summer of Code(2021) R project for statistical computing

re2r back on CRAN

Project Overview

In 2010 Google released its regular expression library(RE2), it provided speed and security for its users as unlike almost all the regular expression libraries, RE2 didn't use backtracking but finite-state automata instead. The library provided a main interface in C++, however since its first release many wrappers were made to help developers make use of the library regardless the technology.

In this GSOC project the goal was to find solutions to many warnings in the R wrapper (re2r ) for the library that prevents the wrapper to be accepted on CRAN( CRAN is a network of ftp and web servers around the world that store identical, up-to-date, versions of code and documentation for R). CRAN has very strict policies that prevents any package to be accepted if it has one single warning. You might find this article useful if you faced any of these warnings before in a CRAN submission.

Warnings

  • Warning No1 (ISO C++ prohibits anonymous structs [-Wpedantic] )

    This warning was due to unnamed structs in the upstream, in files : regexp.h prog.h. It might seem that the solution for this problem is trivial , that is to simply name the struct. Yes this is actually the solution but the challenge is that other modifications are needed to propagate the update successfully across other functions using this struct.

regexp.h

Before

  union {
    struct {  // Repeat
      int max_;
      int min_;
    };
    struct {  // Capture
      int cap_;
      std::string* name_;
    };
    struct {  // LiteralString
      int nrunes_;
      Rune* runes_;
    };
    struct {  // CharClass
      // These two could be in separate union members,
      // but it wouldn't save any space (there are other two-word structs)
      // and keeping them separate avoids confusion during parsing.
      CharClass* cc_;
      CharClassBuilder* ccb_;
    };
    Rune rune_;  // Literal
    int match_id_;  // HaveMatch
    void *the_union_[2];  // as big as any other element, for memset
  };

  Regexp(const Regexp&) = delete;
  Regexp& operator=(const Regexp&) = delete;
};

After

// Arguments to operator.  See description of operators above.
  union OpArg {
    struct Repeat {  // Repeat
      int max_;
      int min_;
    } repeat_;
    struct Capture {  // Capture
      int cap_;
      std::string* name_;
    } capture_;
    struct LiteralString {  // LiteralString
      int nrunes_;
      Rune* runes_;
    } ltstr_;
    struct ChClass {  // CharClass
      // These two could be in separate union members,
      // but it wouldn't save any space (there are other two-word structs)
      // and keeping them separate avoids confusion during parsing.
      CharClass* cc_;
      CharClassBuilder* ccb_;
    } chclass_;
    Rune rune_;  // Literal
    int match_id_;  // HaveMatch
    void *the_union_[2];  // as big as any other element, for memset
  } oparg_;

  Regexp(const Regexp&) = delete;
  Regexp& operator=(const Regexp&) = delete;
};
  int min() { DCHECK_EQ(op_, kRegexpRepeat); return oparg_.repeat_.min_; }
  int max() { DCHECK_EQ(op_, kRegexpRepeat); return oparg_.repeat_.max_; }
  Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return oparg_.rune_; }
  CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return oparg_.chclass_.cc_; }
  int cap() { DCHECK_EQ(op_, kRegexpCapture); return oparg_.capture_.cap_; }
  const std::string* name() { DCHECK_EQ(op_, kRegexpCapture); return oparg_.capture_.name_; }
  Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return oparg_.ltstr_.runes_; }
  int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return oparg_.ltstr_.nrunes_; }
  int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return oparg_.match_id_; }

prog.h

Before

      struct {             // opcode == kInstByteRange
        uint8_t lo_;       //   byte range is lo_-hi_ inclusive
        uint8_t hi_;       //
        uint16_t hint_foldcase_;  // 15 bits: hint, 1 (low) bit: foldcase
                           //   hint to execution engines: the delta to the
                           //   next instruction (in the current list) worth
                           //   exploring iff this instruction matched; 0
                           //   means there are no remaining possibilities,
                           //   which is most likely for character classes.
                           //   foldcase: A-Z -> a-z before checking range.
      };

      EmptyOp empty_;       // opcode == kInstEmptyWidth
                            //   empty_ is bitwise OR of kEmpty* flags above.
    };

After

      struct OpCode {             // opcode == kInstByteRange
        uint8_t lo_;       //   byte range is lo_-hi_ inclusive
        uint8_t hi_;       //
        uint16_t hint_foldcase_;  // 15 bits: hint, 1 (low) bit: foldcase
                           //   hint to execution engines: the delta to the
                           //   next instruction (in the current list) worth
                           //   exploring iff this instruction matched; 0
                           //   means there are no remaining possibilities,
                           //   which is most likely for character classes.
                           //   foldcase: A-Z -> a-z before checking range.
      } oc_;

      EmptyOp empty_;       // opcode == kInstEmptyWidth
                            //   empty_ is bitwise OR of kEmpty* flags above.
    } ia_;
    int out1()      { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return ia_.out1_; }
    int cap()       { DCHECK_EQ(opcode(), kInstCapture); return ia_.cap_; }
    int lo()        { DCHECK_EQ(opcode(), kInstByteRange); return ia_.oc_.lo_; }
    int hi()        { DCHECK_EQ(opcode(), kInstByteRange); return ia_.oc_.hi_; }
    int foldcase()  { DCHECK_EQ(opcode(), kInstByteRange); return ia_.oc_.hint_foldcase_&1; }
    int hint()      { DCHECK_EQ(opcode(), kInstByteRange); return ia_.oc_.hint_foldcase_>>1; }
    int match_id()  { DCHECK_EQ(opcode(), kInstMatch); return ia_.match_id_; }
    EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return ia_.empty_; }
  • Warning No2 ( ISO C++ forbids flexible array member )

dfa.cc

Before

// Work around the bug affecting flexible array members in GCC 6.x (for x >= 1).
// (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70932)
#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && __GNUC_MINOR__ >= 1
    std::atomic<State*> next_[0];   // Outgoing arrows from State,
#else
    std::atomic<State*> next_[];    // Outgoing arrows from State,
#endif

                        // one per input byte class
  };

After

#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && __GNUC_MINOR__ >= 1
    std::atomic<State*> next_[0];   // Outgoing arrows from State,
#else
    std::atomic<State*> next_[1];    // Outgoing arrows from State,
#endif

                        // one per input byte class

onepass.cc

Before

struct OneState {
  uint32_t matchcond;   // conditions to match right now.
  uint32_t action[];
};

After

struct OneState {
  uint32_t matchcond;   // conditions to match right now.
  uint32_t action[1];
};
  • Warning No3 ( compile.cc:292:69: warning: ‘void memset(void, int, size_t)’ clearing an object of non-trivial type ‘class re2::Prog::Inst’; use assignment or valueinitialization instead [-Wclass-memaccess] )

Before

  if (inst_len_ + n > inst_cap_) {
    if (inst_cap_ == 0)
      inst_cap_ = 8;
    while (inst_len_ + n > inst_cap_)
      inst_cap_ *= 2;
    Prog::Inst* ip = new Prog::Inst[inst_cap_];
    if (inst_len_ > 0) {
      memmove(ip, inst_, inst_len_ * sizeof ip[0]);
    }
    memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]);
    delete[] inst_;
    inst_ = ip;

After

  if (ninst_ + n > inst_.size()) {
    int cap = inst_.size();
    if (cap == 0)
      cap = 8;
    while (ninst_ + n > cap)
      cap *= 2;
    PODArray<Prog::Inst> inst(cap);
    if (inst_.data() != NULL)
      memmove(inst.data(), inst_.data(), ninst_*sizeof inst_[0]);
    memset(inst.data() + ninst_, 0, (cap - ninst_)*sizeof inst_[0]);
    inst_ = std::move(inst);
  • Warning No.4 (Check for unstated dependencies in tests file.)

    This problem only arises in Solaris so it can be solved by tracking the test files that cause this warning and skipping them on “Solaris” This is the list of files that caused the warning :
  • test-unicode.R
  • test-replace.R
  • test-detect.R
  • test-match_group.R
  • test-locate.R
  • test-split.R
  • test-count.R
  • test-subset.R
  • test-extract.R

so in these files I added this line this will skip the test file when run on ‘Solaris’:

skip_on_os("solaris")

Conclusion

This was my first experience with open source and indeed I learned a lot during this period about regular expressions and how to create interfaces for C++ with R . I also learned how to submit packages to CRAN and I am currently working on submitting my own multithreading package in R after I finish the tests. Re2 currently have an accepted wrapper in R ( here ). During the last weeks I also worked on creating a benchmarks for the package ( in R ). The GSOC experience was a great one and my mentor was very helpful and surely I will consider joining next summer.

Next Steps

I am willing to work on implementing the re2_extract() function similar to str_extract function in the R wrapper.