Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions awkward-cpp/include/awkward/forth/ForthMachine.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <set>
#include <map>
#include <stack>
#include <memory>

#include "awkward/common.h"
#include "awkward/util.h"
Expand Down Expand Up @@ -503,7 +504,7 @@ namespace awkward {
int64_t output_initial_size_;
double output_resize_factor_;

T* stack_buffer_;
std::unique_ptr<T[]> stack_buffer_;
int64_t stack_depth_;
int64_t stack_max_depth_;

Expand All @@ -521,22 +522,22 @@ namespace awkward {
std::vector<int64_t> bytecodes_offsets_;
std::vector<I> bytecodes_;

char* string_buffer_;
std::unique_ptr<char[]> string_buffer_;
int64_t string_buffer_size_;

std::vector<std::shared_ptr<ForthInputBuffer>> current_inputs_;
std::vector<std::shared_ptr<ForthOutputBuffer>> current_outputs_;
bool is_ready_;

int64_t* current_which_;
int64_t* current_where_;
std::unique_ptr<int64_t[]> current_which_;
std::unique_ptr<int64_t[]> current_where_;
int64_t recursion_current_depth_;
std::stack<int64_t> recursion_target_depth_;
int64_t recursion_max_depth_;

int64_t* do_recursion_depth_;
int64_t* do_stop_;
int64_t* do_i_;
std::unique_ptr<int64_t[]> do_recursion_depth_;
std::unique_ptr<int64_t[]> do_stop_;
std::unique_ptr<int64_t[]> do_i_;
int64_t do_current_depth_;

util::ForthError current_error_;
Expand Down
10 changes: 4 additions & 6 deletions awkward-cpp/src/libawkward/builder/RecordBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,18 +93,16 @@ namespace awkward {

void
RecordBuilder::clear() {
// Keep the record structure (contents_/keys_/pointers_/keys_size_) intact:
// these parallel arrays must stay consistent or form()/to_buffers() would
// read keys_ out of bounds.
for (auto x : contents_) {
x.get()->clear();
}
keys_.clear();
pointers_.clear();
name_ = "";
nameptr_ = nullptr;
length_ = -1;
length_ = 0;
begun_ = false;
nextindex_ = -1;
nexttotry_ = 0;
keys_size_ = 0;
}

bool
Expand Down
31 changes: 26 additions & 5 deletions awkward-cpp/src/libawkward/forth/ForthInputBuffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ namespace awkward {

uint8_t
ForthInputBuffer::peek_byte(int64_t after, util::ForthError& err) noexcept {
if (pos_ + after + 1 > length_) {
int64_t next = pos_ + after + 1;
// reject negative offsets and overflow (next wrapping below pos_)
if (after < 0 || next < pos_ || next > length_) {
err = util::ForthError::read_beyond;
return 0;
}
Expand All @@ -28,7 +30,9 @@ namespace awkward {
void*
ForthInputBuffer::read(int64_t num_bytes, util::ForthError& err) noexcept {
int64_t next = pos_ + num_bytes;
if (next > length_) {
// reject negative/overflowed sizes (num_bytes is derived from a
// stack-supplied item count times sizeof(TYPE) and can wrap negative)
if (num_bytes < 0 || next < pos_ || next > length_) {
err = util::ForthError::read_beyond;
return nullptr;
}
Expand Down Expand Up @@ -294,14 +298,31 @@ namespace awkward {
return 0.0;
}

// Accumulate the integral mantissa in int64_t while it fits (up to 18
// digits, which always fits in a signed 64-bit integer), then switch to
// double to avoid signed-integer-overflow UB on long literals.
int64_t integral = 0;
double integral_d = 0.0;
int64_t integral_digits = 0;
bool integral_overflowed = false;
do {
integral *= 10;
integral += ptr[pos_] - '0';
if (!integral_overflowed && integral_digits < 18) {
integral *= 10;
integral += ptr[pos_] - '0';
integral_digits++;
}
else {
if (!integral_overflowed) {
integral_d = (double)integral;
integral_overflowed = true;
}
integral_d *= 10.0;
integral_d += (double)(ptr[pos_] - '0');
}
pos_++;
} while (pos_ != length_ && ptr[pos_] >= '0' && ptr[pos_] <= '9');

double result = (double)integral;
double result = integral_overflowed ? integral_d : (double)integral;

if (pos_ != length_ && ptr[pos_] == '.') {
pos_++;
Expand Down
42 changes: 24 additions & 18 deletions awkward-cpp/src/libawkward/forth/ForthMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -270,15 +270,7 @@ namespace awkward {
}

template <typename T, typename I>
ForthMachineOf<T, I>::~ForthMachineOf() {
delete [] stack_buffer_;
delete [] string_buffer_;
delete [] current_which_;
delete [] current_where_;
delete [] do_recursion_depth_;
delete [] do_stop_;
delete [] do_i_;
}
ForthMachineOf<T, I>::~ForthMachineOf() = default;

template <typename T, typename I>
int64_t
Expand Down Expand Up @@ -842,7 +834,7 @@ namespace awkward {
template <typename T, typename I>
T
ForthMachineOf<T, I>::stack_at(int64_t from_top) const noexcept {
return stack_buffer_[stack_depth_ - from_top];
return stack_buffer_[stack_depth_ - 1 - from_top];
}

template <typename T, typename I>
Expand Down Expand Up @@ -1428,22 +1420,33 @@ namespace awkward {
bool
ForthMachineOf<T, I>::is_integer(const std::string& word, int64_t& value) const {
if (word.size() >= 2 && word.substr(0, 2) == std::string("0x")) {
std::string digits = word.substr(2, word.size() - 2);
std::size_t pos = 0;
try {
value = (int64_t)std::stoul(word.substr(2, word.size() - 2), nullptr, 16);
value = (int64_t)std::stoll(digits, &pos, 16);
}
catch (std::invalid_argument& err) {
return false;
}
return true;
catch (std::out_of_range& err) {
return false;
}
// reject trailing junk (e.g. "0x12zz")
return pos == digits.size();
}
else {
std::size_t pos = 0;
try {
value = (int64_t)std::stoul(word, nullptr, 10);
value = (int64_t)std::stoll(word, &pos, 10);
}
catch (std::invalid_argument& err) {
return false;
}
return true;
catch (std::out_of_range& err) {
return false;
}
// reject trailing junk (e.g. "123abc")
return pos == word.size();
}
}

Expand Down Expand Up @@ -2541,7 +2544,10 @@ namespace awkward {
}

if (strings_.size() == start_size) {
if (tokenized[(IndexTypeOf<std::string>)pos + 1] == "enum") {
// the "enum"/"enumonly" keyword is at pos - 1 here (pos was
// advanced by 2 from the input-name token at line 2526); reading
// pos + 1 could be out of bounds when no s" strings follow.
if (pos >= 1 && tokenized[(IndexTypeOf<std::string>)pos - 1] == "enum") {
throw std::invalid_argument(
err_linecol(linecol, pos - 2, pos + 1, "need at least one string (s\" word) after \"enum\"")
+ FILENAME(__LINE__)
Expand Down Expand Up @@ -3065,7 +3071,7 @@ namespace awkward {
output = current_outputs_[(IndexTypeOf<int64_t>)out_num].get();
}

uint64_t mask = (1 << bit_width) - 1;
uint64_t mask = (bit_width >= 64) ? ~(uint64_t)0 : ((uint64_t)1 << bit_width) - 1;
uint64_t bits_wnd_l = 8;
uint64_t bits_wnd_r = 0;
int64_t items_remaining = num_items;
Expand Down Expand Up @@ -3191,7 +3197,7 @@ namespace awkward {
input->skipws();
}
int64_t length;
input->read_quotedstr(string_buffer_, string_buffer_size_, length, current_error_);
input->read_quotedstr(string_buffer_.get(), string_buffer_size_, length, current_error_);
if (current_error_ != util::ForthError::none) {
return;
}
Expand All @@ -3201,7 +3207,7 @@ namespace awkward {
}
stack_push((T)length); // note: pushing length
if (length != 0) {
output->write_one_string(string_buffer_, length); // note: copying string
output->write_one_string(string_buffer_.get(), length); // note: copying string
}
}
}
Expand Down
8 changes: 7 additions & 1 deletion awkward-cpp/src/libawkward/forth/ForthOutputBuffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -718,7 +718,13 @@ namespace awkward {
if (next > reserved_) {
int64_t reservation = reserved_;
while (next > reservation) {
reservation = (int64_t)std::ceil((double)reservation * (double)resize_);
int64_t grown = (int64_t)std::ceil((double)reservation * (double)resize_);
// Guarantee strict growth: when reservation is 0 or resize_ <= 1.0 the
// geometric step can fail to increase reservation, looping forever.
if (grown <= reservation) {
grown = reservation + 1;
}
reservation = grown;
}
std::shared_ptr<OUT> new_buffer = std::shared_ptr<OUT>(new OUT[(size_t)reservation],
util::array_deleter<OUT>());
Expand Down
2 changes: 2 additions & 0 deletions awkward-cpp/src/libawkward/io/json.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -865,8 +865,10 @@ namespace awkward {
switch (specializedjson_->instruction()) {
case FillIndexedOptionArray:
specializedjson_->push_stack(specializedjson_->current_instruction() + 1);
break;
case KeyTableHeader:
specializedjson_->push_stack(specializedjson_->current_instruction());
break;
}
int64_t keytableheader_instruction = specializedjson_->current_instruction();
int64_t num_fields = specializedjson_->argument1();
Expand Down
4 changes: 2 additions & 2 deletions awkward-cpp/src/libawkward/util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,13 @@ namespace awkward {
case dtype::uint16:
return "H";
case dtype::uint32:
#if defined _MSC_VER || defined INTPTR_MAX == INT32_MAX
#if defined _MSC_VER || INTPTR_MAX == INT32_MAX
return "L";
#else
return "I";
#endif
case dtype::uint64:
#if defined _MSC_VER || defined INTPTR_MAX == INT32_MAX
#if defined _MSC_VER || INTPTR_MAX == INT32_MAX
return "Q";
#else
return "L";
Expand Down
84 changes: 84 additions & 0 deletions tests/test_4105_forth_vm_and_libawkward_fixes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE

from __future__ import annotations

import sys

import numpy as np
import pytest

import awkward as ak

forth_only = pytest.mark.skipif(
sys.byteorder == "big",
reason="AwkwardForth not yet supported on big-endian systems",
)


def test_from_json_nullable_record_missing_option_key():
# A switch fall-through in nulls_for_optiontype left the instruction stack
# unbalanced, raising a spurious "JSON schema mismatch" instead of null-filling.
schema = {
"type": "array",
"items": {
"type": ["object", "null"],
"properties": {
"x": {"type": "integer"},
"y": {"type": ["integer", "null"]},
},
},
}
out = ak.from_json('[{"x": 1}, null, {"x": 2, "y": 3}]', schema=schema)
assert out.tolist() == [{"x": 1, "y": None}, None, {"x": 2, "y": 3}]


def test_arraybuilder_clear_then_reuse():
# RecordBuilder::clear() used to desync the contents_/keys_ parallel arrays
# and leave length_ == -1, breaking __len__ and reuse after clear().
builder = ak.ArrayBuilder()
with builder.record():
builder.field("x").integer(1)
builder.field("y").real(2.5)

ext = builder._layout
assert len(ext) == 1

ext.clear()
assert len(ext) == 0
assert "RecordArray" in ext.form()

with builder.record():
builder.field("x").integer(7)
builder.field("y").real(8.5)
assert len(ext) == 1
assert builder.snapshot().tolist() == [{"x": 7, "y": 8.5}]


@forth_only
def test_forthmachine_output_initial_size_zero():
# maybe_resize() looped forever when output_initial_size == 0 (reservation
# stayed 0 under geometric growth).
vm = ak.forth.ForthMachine64(
"output out int64 5 0 do i out <- stack loop",
output_initial_size=0,
)
vm.run({})
assert vm.output("out").tolist() == [0, 1, 2, 3, 4]


@forth_only
@pytest.mark.parametrize(
("nbits", "nbytes"),
[(32, 4), (64, 8)],
)
def test_forthmachine_nbit_mask_large_widths(nbits, nbytes):
# mask = (1 << bit_width) - 1 shifted an int literal: UB for bit_width >= 31.
expected = (1 << nbits) - 1
vm = ak.forth.ForthMachine64(
f"input data output out uint64 1 data #{nbits}bit-> out",
output_initial_size=64,
)
data = np.frombuffer(b"\xff" * nbytes, dtype=np.uint8)
vm.run({"data": data})
(result,) = vm.output("out").tolist()
assert (result & 0xFFFFFFFFFFFFFFFF) == (expected & 0xFFFFFFFFFFFFFFFF)
Loading