fix some dotenv parsing

2022-10-03 20:46:19 -04:00 · 2022-10-03 20:46:19 -04:00 · 6dc03ae3ce
parent abd02287e6
commit 6dc03ae3ce
4 changed files with 132 additions and 51 deletions
--- a/shared-bindings/dotenv/init.c
+++ b/shared-bindings/dotenv/init.c
@ -40,9 +40,13 @@
 //|    A subset of the CPython `dotenv library <https://saurabh-kumar.com/python-dotenv/>`_. It does
 //|    not support variables or double quotes.
 //|
-//|    The simplest way to define keys and values is to put them in single quotes. \ and ' are
-//|    escaped by \ in single quotes. Newlines can occur in quotes for multiline values. Comments
-//|    start with # and apply for the rest of the line.
+//|    Keys and values may be put in single quotes.
+//|    ``\`` and ``'`` are escaped by ``\`` in single quotes. Newlines can occur in quotes for multiline values.
+//|    Comments start with ``#`` and apply for the rest of the line.
+//|    A ``#`` immediately following an ``=`` is part of the value, not the start of a comment,
+//|    and a ``#`` embedded in a value without whitespace will be part of that value.
+//|    This corresponds to how assignments and comments work in most Unix shells.
+//|
 //|
 //|    File format example:
 //|
@ -58,6 +62,9 @@
 //|        multiline = 'hello
 //|        world
 //|        how are you?'
+//|        # The #'s below will be included in the value. They do not start a comment.
+//|        key6=#value
+//|        key7=abc#def
 //|
 //| """
 //|
--- a/shared-module/dotenv/init.c
+++ b/shared-module/dotenv/init.c
@ -35,72 +35,90 @@
 #include "py/objstr.h"
 #include "supervisor/filesystem.h"

-STATIC uint8_t consume_spaces(FIL *active_file) {
-    uint8_t character = ' ';
-    UINT quantity_read = 1;
-    while (unichar_isspace(character) && quantity_read > 0) {
-        f_read(active_file, &character, 1, &quantity_read);
-    }
+// Return 0 if there is no next character (EOF).
+STATIC uint8_t get_next_character(FIL *active_file) {
+    uint8_t character = 0;
+    UINT quantity_read;
+    // If there's an error or quantity_read is 0, character will remain 0.
+    f_read(active_file, &character, 1, &quantity_read);
+    return character;
+}
+
+// Discard whitespace, except for newlines, returning the next character after the whitespace.
+// Return 0 if there is no next character (EOF).
+STATIC uint8_t consume_whitespace(FIL *active_file) {
+    uint8_t character;
+    do {
+        character = get_next_character(active_file);
+    } while (character != '\n' && character != 0 && unichar_isspace(character));
    return character;
 }

 // Starting at the start of a new line, determines if the key matches the given
-// key. File pointer is left after the = after the key.
+// key. File pointer is set to be just before the = after the key.
 STATIC bool key_matches(FIL *active_file, const char *key) {
-    uint8_t character = ' ';
-    UINT quantity_read = 1;
-    character = consume_spaces(active_file);
+    uint8_t character;
+    character = consume_whitespace(active_file);
+    if (character == 0) {
+        return false;
+    }
    bool quoted = false;
    if (character == '\'') {
+        // Beginning of single-quoted string.
        quoted = true;
-        f_read(active_file, &character, 1, &quantity_read);
+        character = get_next_character(active_file);
    }
    size_t key_pos = 0;
    bool escaped = false;
    bool matches = true;
    size_t key_len = strlen(key);
-    while (quantity_read > 0) {
+    while (character != 0) {
        if (character == '\\' && !escaped && quoted) {
            escaped = true;
        } else if (!escaped && quoted && character == '\'') {
            quoted = false;
-            // Move past the quoted before breaking so we can check the validity of data past it.
-            f_read(active_file, &character, 1, &quantity_read);
+            // End of quoted key. Skip over the ending quote.
+            character = get_next_character(active_file);
            break;
-        } else if (!quoted && (unichar_isspace(character) || character == '=' || character == '\n' || character == '#')) {
+        } else if (!quoted && (unichar_isspace(character) || character == '=' || character == '\n' || character == '#' || character == 0)) {
+            // End of unquoted key.
            break;
        } else {
-            matches = matches && key[key_pos] == character;
-            escaped = false;
-            key_pos++;
+            // Still on tentative key; see if it matches the next supplied key character,
+            // but don't run off the end of the supplied key.
+            if (key_pos < key_len) {
+                matches = matches && key[key_pos] == character;
+                escaped = false;
+                key_pos++;
+            } else {
+                // Key on line is too long.
+                matches = false;
+            }
        }
-
-        f_read(active_file, &character, 1, &quantity_read);
+        character = get_next_character(active_file);
    }
-    if (unichar_isspace(character)) {
-        character = consume_spaces(active_file);
-    }
-    if (character == '=' || character == '\n' || character == '#') {
-        // Rewind one so the value can find it.
+    if (character == '=' || character == '\n' || character == '#' || character == 0) {
+        // Rewind one so the value, if any, can be found.
        f_lseek(active_file, f_tell(active_file) - 1);
    } else {
        // We're followed by something else that is invalid syntax.
        matches = false;
    }
+
    return matches && key_pos == key_len;
 }

 STATIC bool next_line(FIL *active_file) {
-    uint8_t character = ' ';
-    UINT quantity_read = 1;
+    uint8_t character;
    bool quoted = false;
    bool escaped = false;
    // Track comments because they last until the end of the line.
    bool comment = false;
-    FRESULT result = FR_OK;
    // Consume all characters while quoted or others up to \n.
-    while (result == FR_OK && quantity_read > 0 && (quoted || character != '\n')) {
-        if (character == '#' || comment) {
+    do {
+        character = get_next_character(active_file);
+
+        if ((!quoted || character == '#') || comment) {
            // Comments consume any escaping.
            comment = true;
        } else if (!escaped) {
@ -112,33 +130,32 @@ STATIC bool next_line(FIL *active_file) {
        } else {
            escaped = false;
        }
-        result = f_read(active_file, &character, 1, &quantity_read);
-    }
-    return result == FR_OK && quantity_read > 0;
+    } while (character != 0 && (quoted || character != '\n'));
+
+    return character != 0;
 }

 STATIC mp_int_t read_value(FIL *active_file, char *value, size_t value_len) {
-    uint8_t character = ' ';
-    UINT quantity_read = 1;
-    // Consume spaces before =
-    character = consume_spaces(active_file);
+    uint8_t character;
+    // Consume spaces before "=", and get first character of interest.
+    character = consume_whitespace(active_file);
    if (character != '=') {
        if (character == '#' || character == '\n') {
            // Keys without an = after them are valid with the value None.
-            return 0;
+            return -1;
        }
        // All other characters are invalid.
        return -1;
    }
-    character = ' ';
    // Consume space after =
-    while (unichar_isspace(character) && quantity_read > 0) {
-        f_read(active_file, &character, 1, &quantity_read);
+    if (character != '#') {
+        // a # immediately after = is part of the value!
+        character = consume_whitespace(active_file);
    }
    bool quoted = false;
    if (character == '\'') {
        quoted = true;
-        f_read(active_file, &character, 1, &quantity_read);
+        character = get_next_character(active_file);
    }
    if (character == '"') {
        // We don't support double quoted values.
@ -150,12 +167,13 @@ STATIC mp_int_t read_value(FIL *active_file, char *value, size_t value_len) {
    // Count trailing spaces so we can ignore them at the end of unquoted
    // values.
    size_t trailing_spaces = 0;
-    while (quantity_read > 0) {
+    bool first_char = true;
+    while (character != 0) {
        // Consume the first \ if the value is quoted.
        if (quoted && character == '\\' && !escaped) {
            escaped = true;
-            // Drop this slash by short circuiting the rest of the loop.
-            f_read(active_file, &character, 1, &quantity_read);
+            // Drop this backslash by short circuiting the rest of the loop.
+            character = get_next_character(active_file);
            continue;
        }
        if (quoted && !escaped && character == '\'') {
@ -163,7 +181,7 @@ STATIC mp_int_t read_value(FIL *active_file, char *value, size_t value_len) {
            break;
        }
        // Unquoted values are ended by a newline or comment.
-        if (!quoted && (character == '\n' || character == '#')) {
+        if (!quoted && (character == '\n' || (character == '#' && !first_char))) {
            if (character == '\n') {
                // Rewind one so the next_line can find the \n.
                f_lseek(active_file, f_tell(active_file) - 1);
@ -182,7 +200,8 @@ STATIC mp_int_t read_value(FIL *active_file, char *value, size_t value_len) {
            value[value_pos] = character;
        }
        value_pos++;
-        f_read(active_file, &character, 1, &quantity_read);
+        character = get_next_character(active_file);
+        first_char = false;
    }

    return value_pos - trailing_spaces;
@ -214,7 +233,7 @@ mp_obj_t common_hal_dotenv_get_key(const char *path, const char *key) {
    // the length.
    char value[64];
    mp_int_t actual_len = dotenv_get_key(path, key, value, sizeof(value));
-    if (actual_len <= 0) {
+    if (actual_len < 0) {
        return mp_const_none;
    }
    if ((size_t)actual_len >= sizeof(value)) {
--- a/tests/circuitpython-manual/dotenv_test.env
+++ b/tests/circuitpython-manual/dotenv_test.env
@ -0,0 +1,32 @@
+# No e0 value
+   # comment preceded by spaces
+e1=e1value
+e2=e2value # value followed by a comment
+e3='e3value'
+e4='e4value' # quoted value followed by a comment
+# e5 should be None
+e5
+# e6 should be the empty string
+e6=
+# e7 should be '#' (bash-like syntax processing)
+e7=#
+# e8 should be the empty string
+e8=''
+# e9 should be the empty string
+e9= #
+e10=e10_first
+e10=e10_last
+e11='abc#def'
+# e12 should be 'abc#def'
+e12=abc#def
+e12='multi
+line'
+e13=e13value
+e14  #comment
+e15 = e15value
+# e16 should be '#'
+e16=# #
+# e17 should be 'def#hi'
+e17='def'#hi
+# e18 should be '#has a hash'
+e18=#has a hash
--- a/tests/circuitpython-manual/dotenv_test.py
+++ b/tests/circuitpython-manual/dotenv_test.py
@ -0,0 +1,23 @@
+import dotenv
+
+FILE = "dotenv_test.env"
+
+print("e0", dotenv.get_key(FILE, "e0"))
+print("e1", dotenv.get_key(FILE, "e1"))
+print("e2", dotenv.get_key(FILE, "e2"))
+print("e3", dotenv.get_key(FILE, "e3"))
+print("e4", dotenv.get_key(FILE, "e4"))
+print("e5", dotenv.get_key(FILE, "e5"))
+print("e6", dotenv.get_key(FILE, "e6"))
+print("e7", dotenv.get_key(FILE, "e7"))
+print("e8", dotenv.get_key(FILE, "e8"))
+print("e9", dotenv.get_key(FILE, "e9"))
+print("e10", dotenv.get_key(FILE, "e10"))
+print("e11", dotenv.get_key(FILE, "e11"))
+print("e12", dotenv.get_key(FILE, "e12"))
+print("e13", dotenv.get_key(FILE, "e13"))
+print("e14", dotenv.get_key(FILE, "e14"))
+print("e15", dotenv.get_key(FILE, "e15"))
+print("e16", dotenv.get_key(FILE, "e16"))
+print("e17", dotenv.get_key(FILE, "e17"))
+print("e18", dotenv.get_key(FILE, "e18"))