AssemblyScript · geraintluff · Jan 13, 2026 · Jan 13, 2026 · Jan 15, 2026 · Jan 15, 2026
diff --git a/NOTICE b/NOTICE
@@ -62,6 +62,7 @@ under the licensing terms detailed in LICENSE:
 * Kam Chehresa <kaz.che@gmail.com>
 * Mopsgamer <79159094+Mopsgamer@users.noreply.github.com>
 * EDM115 <github@edm115.dev>
+* Geraint Luff <luffgd@gmail.com>
 
 Portions of this software are derived from third-party works licensed under
 the following terms:

diff --git a/src/builtins.ts b/src/builtins.ts
@@ -749,6 +749,7 @@ export namespace BuiltinNames {
   export const memory_copy = "~lib/memory/memory.copy";
   export const memory_fill = "~lib/memory/memory.fill";
   export const memory_data = "~lib/memory/memory.data";
+  export const memory_dataUTF8 = "~lib/memory/memory.dataUTF8";
 
   // std/typedarray.ts
   export const Int8Array = "~lib/typedarray/Int8Array";
@@ -3491,6 +3492,78 @@ function builtin_memory_data(ctx: BuiltinFunctionContext): ExpressionRef {
 }
 builtinFunctions.set(BuiltinNames.memory_data, builtin_memory_data);
 
+function utf16ToUtf8(str: string) : Uint8Array { // encodes a UTF-16 string as UTF-8 bytes; no terminator is appended here
+  let result = new Uint8Array(str.length*3); // worst case is 3 UTF-8 bytes per UTF-16 code unit (1.5x the UTF-16 byte size)
+  let utf8Length: i32 = 0; // track actual encoded length
+  for (let i: i32 = 0; i < str.length; ++i) {
+    // UTF-16 decode: read one code unit, merging a valid surrogate pair into a single code point
+    let codePoint: u32 = str.charCodeAt(i);
+    if (codePoint >= 0xD800 && codePoint < 0xDC00) {
+      // could be the first half of a surrogate pair (if a low surrogate follows)
+      let codePoint2: u32 = str.charCodeAt(i + 1); // read before the bounds check; only compared when i + 1 is in range (short-circuit below)
+      if (i + 1 < str.length && codePoint2 >= 0xDC00 && codePoint2 < 0xE000) {
+        // valid surrogate pair - combine to get the code-point
+        codePoint = ((codePoint&0x3FF)<<10) + (codePoint2&0x3FF) + 0x10000; // 10 bits from each half, offset by 0x10000
+        ++i; // skip the low surrogate that was just consumed
+      }
+    }
+    // UTF-8 encode: the number of bytes depends on the magnitude of the code point
+    if (codePoint < 0x0080) { // 1 byte: 0xxxxxxx
+      result[utf8Length++] = codePoint;
+    } else if (codePoint < 0x0800) { // 2 bytes: 110xxxxx 10xxxxxx
+      result[utf8Length++] = 0xC0 + (codePoint>>6);
+      result[utf8Length++] = 0x80 + (codePoint&0x3F);
+    } else if (codePoint < 0x10000) { // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx (unpaired surrogates also land here, encoded as-is)
+      result[utf8Length++] = 0xE0 + ((codePoint>>12)&0x0F);
+      result[utf8Length++] = 0x80 + ((codePoint>>6)&0x3F);
+      result[utf8Length++] = 0x80 + (codePoint&0x3F);
+    } else { // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+      result[utf8Length++] = 0xF0 + ((codePoint>>18)&0x07);
+      result[utf8Length++] = 0x80 + ((codePoint>>12)&0x3F);
+      result[utf8Length++] = 0x80 + ((codePoint>>6)&0x3F);
+      result[utf8Length++] = 0x80 + (codePoint&0x3F);
+    }
+  }
+  let trimmed = new Uint8Array(utf8Length);
+  for (let i: i32 = 0; i < utf8Length; ++i) trimmed[i] = result[i];
+  return trimmed;
-  let trimmed = new Uint8Array(utf8Length);
-  for (let i: i32 = 0; i < utf8Length; ++i) trimmed[i] = result[i];
-  return trimmed;
+  return result.subarray(0, utf8Length);
-  let trimmed = new Uint8Array(utf8Length);
-  for (let i: i32 = 0; i < utf8Length; ++i) trimmed[i] = result[i];
-  return trimmed;
+  return result.subarray(0, utf8Length);
+}
+
+// memory.dataUTF8(value) -> usize: stores the string literal `value` in a static memory segment as null-terminated UTF-8 and evaluates to the segment's offset
+function builtin_memory_dataUTF8(ctx: BuiltinFunctionContext): ExpressionRef {
+  let compiler = ctx.compiler;
+  let module = compiler.module;
+  if (
+    checkTypeAbsent(ctx) | // bitwise '|' (not '||'): both checks are always evaluated
+    checkArgsRequired(ctx, 1)
+  ) return module.unreachable();
+  let operands = ctx.operands;
+  let usizeType = compiler.options.usizeType;
+  let offset: i64;
+  let arg0 = operands[0];
+  if (!arg0.isLiteralKind(LiteralKind.String)) { // only a string literal is accepted as the argument
+    compiler.error(
+      DiagnosticCode.String_literal_expected,
+      arg0.range
+    );
+    return module.unreachable();
+  }
+  let str = (<StringLiteralExpression>arg0).value;
+  let array : Uint8Array = utf16ToUtf8(str); // UTF-8 bytes of the literal, without a terminator
+  let arrayNullTerminated = new Uint8Array(array.length + 1); // one extra zero-initialized byte acts as the null terminator
+  arrayNullTerminated.set(array);
+  offset = compiler.addAlignedMemorySegment(arrayNullTerminated, 1).offset; // second argument 1: presumably the byte alignment - confirm against addAlignedMemorySegment
+  // FIXME: what if recompiles happen? recompiles are bad.
+  compiler.currentType = usizeType; // the call expression has type usize
+  if (usizeType == Type.usize32) {
+    assert(!i64_high(offset)); // a 32-bit pointer requires the high half of the offset to be zero
+    return module.i32(i64_low(offset));
+  } else {
+    return module.i64(i64_low(offset), i64_high(offset));
+  }
+}
+builtinFunctions.set(BuiltinNames.memory_dataUTF8, builtin_memory_dataUTF8); // registered under BuiltinNames.memory_dataUTF8
+
 // === GC =====================================================================================
 
 function builtin_i31_new(ctx: BuiltinFunctionContext): ExpressionRef {

diff --git a/std/assembly/index.d.ts b/std/assembly/index.d.ts
@@ -1798,6 +1798,8 @@ declare namespace memory {
   export function data(size: i32, align?: i32): usize;
   /** Gets a pointer to a pre-initialized static chunk of memory. Alignment defaults to the size of `T`. Arguments must be compile-time constants. */
   export function data<T>(values: T[], align?: i32): usize;
+  /** Gets a pointer to a pre-initialized static chunk of memory containing `value` encoded as null-terminated UTF-8. `value` must be a string literal. */
+  export function dataUTF8(value: string): usize;
 
   export namespace atomic {
     /** Performs a wait operation on a 32-bit integer value in memory suspending this agent if the condition is met. */

diff --git a/std/assembly/memory.ts b/std/assembly/memory.ts
@@ -76,6 +76,11 @@ export namespace memory {
   // @ts-ignore: decorator
   @builtin
   export declare function data<T>(size: T, align?: i32): usize;
+
+  /** Gets a pointer to static memory holding `str` encoded as null-terminated UTF-8. `str` must be a string literal. */
+  // @ts-ignore: decorator
+  @builtin
+  export declare function dataUTF8(str : string): usize;
 }
 
 // @ts-ignore: decorator

diff --git a/tests/compiler/memory.debug.wat b/tests/compiler/memory.debug.wat
@@ -4,9 +4,9 @@
  (type $2 (func (param i32 i32 i32 i32)))
  (import "env" "abort" (func $~lib/builtins/abort (param i32 i32 i32 i32)))
  (global $memory/ptr (mut i32) (i32.const 80))
- (global $~lib/memory/__data_end i32 (i32.const 212))
- (global $~lib/memory/__stack_pointer (mut i32) (i32.const 32980))
- (global $~lib/memory/__heap_base i32 (i32.const 32980))
+ (global $~lib/memory/__data_end i32 (i32.const 220))
+ (global $~lib/memory/__stack_pointer (mut i32) (i32.const 32988))
+ (global $~lib/memory/__heap_base i32 (i32.const 32988))
  (memory $0 1)
  (data $0 (i32.const 16) "\00\00\00\00")
  (data $1 (i32.const 28) ",\00\00\00\00\00\00\00\00\00\00\00\02\00\00\00\12\00\00\00m\00e\00m\00o\00r\00y\00.\00t\00s\00\00\00\00\00\00\00\00\00\00\00")
@@ -33,6 +33,8 @@
  (data $22 (i32.const 206) "\01")
  (data $23 (i32.const 207) "\01")
  (data $24 (i32.const 208) "\01")
+ (data $25 (i32.const 209) ":)\00")
+ (data $26 (i32.const 212) "\f0\9f\90\8c\00")
  (table $0 1 1 funcref)
  (elem $0 (i32.const 1))
  (export "memory" (memory $0))
@@ -475,6 +477,126 @@
    call $~lib/builtins/abort
    unreachable
   end
+  i32.const 209
+  global.set $memory/ptr
+  global.get $memory/ptr
+  i32.load8_u
+  i32.const 58
+  i32.eq
+  i32.eqz
+  if
+   i32.const 0
+   i32.const 48
+   i32.const 66
+   i32.const 1
+   call $~lib/builtins/abort
+   unreachable
+  end
+  global.get $memory/ptr
+  i32.const 1
+  i32.add
+  i32.load8_u
+  i32.const 41
+  i32.eq
+  i32.eqz
+  if
+   i32.const 0
+   i32.const 48
+   i32.const 67
+   i32.const 1
+   call $~lib/builtins/abort
+   unreachable
+  end
+  global.get $memory/ptr
+  i32.const 2
+  i32.add
+  i32.load8_u
+  i32.const 0
+  i32.eq
+  i32.eqz
+  if
+   i32.const 0
+   i32.const 48
+   i32.const 68
+   i32.const 1
+   call $~lib/builtins/abort
+   unreachable
+  end
+  i32.const 212
+  global.set $memory/ptr
+  global.get $memory/ptr
+  i32.load8_u
+  i32.const 240
+  i32.eq
+  i32.eqz
+  if
+   i32.const 0
+   i32.const 48
+   i32.const 71
+   i32.const 1
+   call $~lib/builtins/abort
+   unreachable
+  end
+  global.get $memory/ptr
+  i32.const 1
+  i32.add
+  i32.load8_u
+  i32.const 159
+  i32.eq
+  i32.eqz
+  if
+   i32.const 0
+   i32.const 48
+   i32.const 72
+   i32.const 1
+   call $~lib/builtins/abort
+   unreachable
+  end
+  global.get $memory/ptr
+  i32.const 2
+  i32.add
+  i32.load8_u
+  i32.const 144
+  i32.eq
+  i32.eqz
+  if
+   i32.const 0
+   i32.const 48
+   i32.const 73
+   i32.const 1
+   call $~lib/builtins/abort
+   unreachable
+  end
+  global.get $memory/ptr
+  i32.const 3
+  i32.add
+  i32.load8_u
+  i32.const 140
+  i32.eq
+  i32.eqz
+  if
+   i32.const 0
+   i32.const 48
+   i32.const 74
+   i32.const 1
+   call $~lib/builtins/abort
+   unreachable
+  end
+  global.get $memory/ptr
+  i32.const 4
+  i32.add
+  i32.load8_u
+  i32.const 0
+  i32.eq
+  i32.eqz
+  if
+   i32.const 0
+   i32.const 48
+   i32.const 75
+   i32.const 1
+   call $~lib/builtins/abort
+   unreachable
+  end
  )
  (func $~start
   call $start:memory

diff --git a/tests/compiler/memory.release.wat b/tests/compiler/memory.release.wat
@@ -14,6 +14,8 @@
  (data $22 (i32.const 1214) "\01")
  (data $23 (i32.const 1215) "\01")
  (data $24 (i32.const 1216) "\01")
+ (data $25 (i32.const 1217) ":)")
+ (data $26 (i32.const 1220) "\f0\9f\90\8c")
  (export "memory" (memory $0))
  (start $~start)
  (func $start:memory
@@ -256,6 +258,102 @@
   global.set $memory/ptr
   i32.const 1215
   global.set $memory/ptr
+  i32.const 1217
+  global.set $memory/ptr
+  i32.const 1217
+  i32.load8_u
+  i32.const 58
+  i32.ne
+  if
+   i32.const 0
+   i32.const 1056
+   i32.const 66
+   i32.const 1
+   call $~lib/builtins/abort
+   unreachable
+  end
+  global.get $memory/ptr
+  i32.load8_u offset=1
+  i32.const 41
+  i32.ne
+  if
+   i32.const 0
+   i32.const 1056
+   i32.const 67
+   i32.const 1
+   call $~lib/builtins/abort
+   unreachable
+  end
+  global.get $memory/ptr
+  i32.load8_u offset=2
+  if
+   i32.const 0
+   i32.const 1056
+   i32.const 68
+   i32.const 1
+   call $~lib/builtins/abort
+   unreachable
+  end
+  i32.const 1220
+  global.set $memory/ptr
+  i32.const 1220
+  i32.load8_u
+  i32.const 240
+  i32.ne
+  if
+   i32.const 0
+   i32.const 1056
+   i32.const 71
+   i32.const 1
+   call $~lib/builtins/abort
+   unreachable
+  end
+  global.get $memory/ptr
+  i32.load8_u offset=1
+  i32.const 159
+  i32.ne
+  if
+   i32.const 0
+   i32.const 1056
+   i32.const 72
+   i32.const 1
+   call $~lib/builtins/abort
+   unreachable
+  end
+  global.get $memory/ptr
+  i32.load8_u offset=2
+  i32.const 144
+  i32.ne
+  if
+   i32.const 0
+   i32.const 1056
+   i32.const 73
+   i32.const 1
+   call $~lib/builtins/abort
+   unreachable
+  end
+  global.get $memory/ptr
+  i32.load8_u offset=3
+  i32.const 140
+  i32.ne
+  if
+   i32.const 0
+   i32.const 1056
+   i32.const 74
+   i32.const 1
+   call $~lib/builtins/abort
+   unreachable
+  end
+  global.get $memory/ptr
+  i32.load8_u offset=4
+  if
+   i32.const 0
+   i32.const 1056
+   i32.const 75
+   i32.const 1
+   call $~lib/builtins/abort
+   unreachable
+  end
  )
  (func $~start
   call $start:memory

diff --git a/tests/compiler/memory.ts b/tests/compiler/memory.ts
@@ -59,3 +59,17 @@ assert(ptr + 4 == (ptr = memory.data<u8>([1], 4)));
 assert(ptr + 2 == (ptr = memory.data<u8>([1], 2)));
 assert(ptr + 1 == (ptr = memory.data<u8>([1], 1)));
 assert(ptr + 1 == memory.data<u8>([1], 16));
+
+// memory.dataUTF8 should encode string literals as null-terminated UTF-8
+
+ptr = memory.dataUTF8(":)"); // ASCII only: one byte per character
+assert(load<u8>(ptr) == 0x3A);
+assert(load<u8>(ptr + 1) == 0x29);
+assert(load<u8>(ptr + 2) == 0); // null terminator
+
+ptr = memory.dataUTF8("🐌"); // U+1F40C: a surrogate pair in UTF-16, four bytes in UTF-8
+assert(load<u8>(ptr) == 0xF0);
+assert(load<u8>(ptr + 1) == 0x9F);
+assert(load<u8>(ptr + 2) == 0x90);
+assert(load<u8>(ptr + 3) == 0x8C);
+assert(load<u8>(ptr + 4) == 0x00); // null terminator