commit 4482c6260cd8dbe9c40545d0e5fb61f2444c3792
Author: rani <rani@buni.party>
Date: Sat Jan 31 17:38:03 2026 +0000
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..a4cf8e5
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,23 @@
+BUILDDIR=../site
+DPP?= dpp
+
+SRC= index.html \
+ error/404.html \
+ blog/index.html \
+ blog/paragon/index.html
+
+TEMPLATE=template/bar.html \
+ template/foot.html \
+ template/head.html \
+ template/readtime.html
+
+OUT= ${SRC:%=${BUILDDIR}/%}
+
+
+all: ${OUT}
+
+${BUILDDIR}/%: % ${TEMPLATE}
+ @mkdir -p ${@D}
+ DPP_BLOCK='!!' ${DPP} $< <$< >$@
+
+.PHONY: all
diff --git a/blog/index.html b/blog/index.html
new file mode 100644
index 0000000..5a38791
--- /dev/null
+++ b/blog/index.html
@@ -0,0 +1,35 @@
+!!
+$DPP "Bun Blog" "Welcome to the Bun Blog" <template/head.html # >
+$DPP blog <template/bar.html # >
+
+strip() {
+ temp="${@#\"}"
+ temp="${temp%\"}"
+ printf '%s\n' "$temp"
+}
+
+for post in blog/*; do
+ [ ! -d "$post" ] && continue
+ [ ! -e "$post/index.html" ] && continue
+
+ title=""
+ date=""
+
+ count=0
+ while IFS='=' read -r key val; do
+ [ $count -eq 2 ] && break
+ count=$((count + 1))
+
+ key="${key#\!\!}"
+ case "$key" in
+ title) title="$(strip "$val")" ;;
+ date) date="$(strip "$val")" ;;
+ esac
+ done <"$post/index.html" # >
+
+ printf '<p><a href="%s">%s</a> %s</p>\n' "${post##*/}" "$title" "$date"
+done
+!!
+
+
+!!$DPP <template/foot.html # >
diff --git a/blog/paragon/index.html b/blog/paragon/index.html
new file mode 100644
index 0000000..75a3acc
--- /dev/null
+++ b/blog/paragon/index.html
@@ -0,0 +1,366 @@
+!!title="A Parallel Graphics Processor"
+!!date="December 13, 2025"
+!!
+$DPP "$title" "$title" <template/head.html # >
+$DPP <template/bar.html # >
+
+$DPP "$1" <template/readtime.html # >
+!!
+
+<p><a class="out" href="https://github.com/Raniconduh/paragon">PARAGON</a>
+(Parallel Architecture for Rendering and Graphics OperatioNs) is a graphics
+processor I designed for a class. Unlike some other class projects, this one was
+a lot of fun to build and so I've decided to write about it.</p>
+
+<h3>Overview</h3>
+<p>The final design consisted of 12 compute cores drawing to a 320x240
+framebuffer. The framebuffer stores 8 bit pixels (RGB 332) and each compute core
+accesses a single pixel at a time. Internally, the compute cores are a (rather
+strange) 24 bit Harvard RISC architecture. The architecture was designed to be
+as minimal as possible while still being very usable. (More on this later.) Each
+core runs in parallel and, for the most part, does not need to know that any
+other core exists. The video memory is organized in a 3 level hierarchy, with
+two levels of caching and the framebuffer at the highest level. (Note, however,
+that the final design did not include the level 2 cache due to reasons
+below.)</p>
+
+<h3>Architecture</h3>
+<p>
+The compute cores are designed around a minimal RISC architecture with 16 bit
+fixed width instructions and 24 bit registers. The register file contains 16
+registers indexed 0 through 15 where register 0 is the zero register (sinking
+all writes and sourcing 0) and register 15 is the core ID register (sinking
+writes and sourcing the ID of the current core). The core ID register is what
+truly allows for parallelism, as otherwise each core would run the exact same
+instructions in the exact same order. (This can cause glitches depending on the
+kernel; more on this later.)
+</p>
+<p>
+To the untrained eye, a register which is always zero seems wasteful. If you
+need a zero, you can just load it in, right? The zero register is a hallmark of
+RISC architectures. In fact, the inclusion of this register makes the
+architecture smaller and simpler, although it also means the programmer has to
+be more aware of what they are doing. (In many cases, this can be handled by the
+assembler. e.g. <code>mov rd, rs</code> is translated as <code>add rd, r0, rs</code>.)</p>
+<p>
+Instructions have 4 bit opcodes, meaning there are only 16 possible
+instructions. Fortunately, this limit was not reached in the final design. Most
+instructions have 3 operands which is typical of RISC architectures; although,
+some instructions use fewer operands.
+</p>
+<p>
+The instruction set contains the following ALU operations: ADD, SUB, MUL, AND,
+OR, XOR, NOT, SH, ADDI. The first few are self explanatory. SH is the shift
+instruction which performs left, right, or arithmetic shifts on a register by an
+immediate (i.e. a constant which is not a register). ADDI is perhaps the most
+unusual instruction of the ALU operations as it is not strictly necessary.
+Simply, it adds an immediate to a register. Note, however, that the immediate is
+a signed 4 bit integer (ew). This is the only instruction to use a 4 bit signed
+immediate. Even though the instruction is absolutely not required, it makes
+writing programs much easier.
+</p>
+<p>
+The processor's memory operations are limited, loading and storing only 1 byte at
+a time. The LD and ST instructions are the only two memory operations and they
+are used to access video memory (pixelwise). Because there are no other memory
+operations, the astute may wonder how the program memory is accessed. The answer
+to this is that program memory cannot be accessed. It is read only and only ever
+touched by the control system (i.e. a programmer cannot read or write to it).
+The LDI instruction is in a similar vein, although it does not read from memory.
+Instead it loads an unsigned 8 bit immediate into a register.
+</p>
+<p>
+Finally, the architecture includes two control flow instructions. B is the
+branch instruction. It supports both conditional and unconditional branch and
+can either branch to a PC relative immediate or directly to an address stored in
+a register. Once again, some may recognize that something is missing. Two things
+actually. Mainly, there is no compare instruction. This seems to imply that all
+branching is unconditional. However, all ALU operations modify a set of flags:
+N, Z, and C. These flags store whether the output of the ALU is (N)egative (i.e.
+the MSB is set), (Z)ero, or if the operation generated a carryout. Using these
+flags, comparisons are typically created by subtracting two registers and
+discarding the result. The other thing that is missing is some sort of linked
+branch (i.e. function call). This is solved by the final control instruction
+AIPC, or add immediate to program counter. The instruction adds a signed 8 bit
+immediate to the program counter (PC) and stores it in a register. This is
+especially helpful for direct branches and e.g. function calls. The instruction
+is technically redundant since the PC is always known to the assembler, but it
+helps make writing functions easier.
+</p>
+
+<h4>Program Memory</h4>
+<p>
+As mentioned above, the program memory is used solely by the processor's control
+unit. Even though it is not as extensible as the video memory, the program
+memory is obviously a necessary part of the system. Separating the program
+memory (ROM) from the video memory (VRAM) also helped keep the architecture
+smaller and simpler (and is what makes it a Harvard architecture).
+</p>
+<p>
+One of the most important parts of a graphics processor is having it run
+different kernels. (It wouldn't be very interesting if it could only ever do one
+thing.) Because of this, the program ROM is reprogrammable. By enabling the
+programming mode on the ROM, instructions can be streamed to the graphics
+processor where they will be stored in the same order. Once programming is
+complete, the processor begins execution, starting at the first instruction.
+</p>
+<p>
+But if the compute cores are independent of each other, how can they work with
+only one program ROM? Well, one solution to this is to just go core by core,
+handling each read request one at a time. Of course, this is horribly slow. The
+solution used by PARAGON is to copy the instruction ROM into multiple
+(individual) ROMs. Since the ROM is (as per the name) read only, there is no
+risk of processors modifying the instructions in the kernel. This means that
+every core can access the program ROM at the exact same time and not have to
+worry about waiting for other cores. On the other hand, this means that the
+program ROM is way bigger than it has to be. This is not a huge deal for PARAGON
+since the ROM is quite small (2048 instructions deep). I also believe that speed
+is a much more important factor in this case than memory utilization. (If read
+requests were handled one at a time, I'm inclined to believe that ROM would be
+12x slower, meaning there wouldn't really be much benefit to having more than 1
+core.)
+</p>
+
+<h4>Video Memory</h4>
+<p>
+The video memory (VRAM) is seemingly simple. Most of video memory (~60%) is
+allocated to the framebuffer (what you actually see on the screen). There is a
+bit of memory (~40%) beneath the framebuffer for general purpose use, although
+it is a bit limited since memory is accessed byte by byte. (Maybe in the future
+loads and stores can optionally work on multiple bytes.) Again, this doesn't
+seem too complex. But, what happens when multiple cores are trying to use VRAM
+at the same time? The program ROM solution can be applied here too. Just copy
+the VRAM for each core. This seems to keep VRAM fast like it did with the
+program ROM. Although, the program ROM was very small (4kB). Copying VRAM for 12
+cores would cause it to be huge and absolutely impossible to accomplish. The
+real solution (and the one employed by PARAGON) is to have a caching scheme and
+memory arbitration.
+</p>
+<p>
+The caching scheme is fairly simple (especially because of the way VRAM was
+actually implemented). Each core has a private write-through level 1 cache that
+stores 64 cache lines (each of which is 8 bytes wide; i.e. a total of 512 bytes
+of cache). The cache is directly mapped meaning indices in the cache are not
+unique. (This can lead to thrashing due to conflicting cache accesses, but I am
+treating it as a non issue for the sake of PARAGON.) When a core wants to access
+VRAM, the cache first checks if the memory has been cached already. If it has,
+great! On reads, just return the cache line. On writes, modify it and send it to
+the next level. When the memory has not yet been cached, the L1 cache requests
+the line from the next level and waits for it to come back. Although, what
+happens if a core writes to memory that is cached by another core? There are a
+lot of different coherency schemes that can be used to handle this but PARAGON
+employs the simplest: such cache lines are invalidated. This means that if
+one core tries to read cached memory that has been modified by a different core,
+it will have to fetch the modified line from the next level.
+</p>
+<p>
+In the final design of PARAGON, the memory hierarchy does not include a level
+two cache. Instead, the memory is directly arbitrated between the private L1
+caches and the full VRAM. The arbitration is fairly simple. The memory arbiter
+keeps track of the last core that it granted a request to. It then tries to find
+requests starting at the next core before wrapping back around. This means that
+one core will never be able to hold the memory arbiter forever.
+</p>
+
+<h3>Okay... Does it Even Work?</h3>
+<p>
+Yes! PARAGON was implemented on a Xilinx Spartan 7 FPGA on the RealDigital
+<a class="out" href="https://www.realdigital.org/hardware/urbana">Urbana Board</a>
+development board. The Spartan 7 is a decent low end FPGA with a fair bit of
+resources. Originally, I thought that my limiting factor would be LUTs. However,
+once I actually built the thing, I quickly found out that I was mostly limited
+by the RAM. The Spartan 7 (XC7S50) contains 75 RAMB36 blocks. These are
+synchronous, dual port, 36 kilobit SRAM blocks. There are four ways in which RAM
+is used by the processor: program memory, cache memory, video memory, and CPU
+memory. I haven't mentioned anything about a CPU so far, but it is a necessary
+part of the graphics processor. Of course, you wouldn't have a computer with a
+GPU but no CPU. The CPU used in the design is the Xilinx MicroBlaze processor. I
+stripped out as much as I could from it to make it small (and slow; the CPU
+performance is not an integral part of the graphics processor). The MicroBlaze
+seems to require about 16KB of memory including all the graphics demos, although
+I allocated 64KB just to be safe. All in all, the biggest memory consumer is the
+VRAM. This is because it is implemented entirely on the on-chip memory. The dev
+board does come with 128Mbits of DDR3 memory, but I had a lot of difficulty
+trying to get it to work and I eventually scrapped that idea. The upside to
+using the on-chip memory is that it is very fast: the memory has a 1 clock cycle
+read latency, much faster than however many clock cycles it would take for DDR3
+to be ready. This is what led to the omission of an L2 cache. If the large, slow
+DDR3 had been used, the L2 cache would likely be necessary.
+</p>
+<p>
+So how fast is it? Well, each compute core runs at 50MHz. Originally the plan
+was to run them at 100MHz but that has proven to be impossible for two reasons.
+Firstly, the processor cores are not particularly efficient; that is, they are
+not pipelined and expect all data to be ready in the same clock cycle. This is
+not scalable, however, and initially caused me to lower the clock frequency to
+50MHz. (This might seem like an oversight on my part, but remember that this was
+a class project and I was on a time limit. Pipelined processor design is not the
+easiest and would greatly increase the development time.) The other limiting
+factor is physical. The processor uses logic cells from all over the chip.
+Physically, routing signals from one corner of the chip to the next is slow. I'm
+sure I could bump up the frequency a little but 50MHz is a nice round number.
+</p>
+
+<h3>Demos</h3>
+<p>
+I designed a bunch of demos to showcase PARAGON's processing power. Since the
+architecture contains only integer arithmetic and no special graphics
+operations, kernels are limited to fixed point arithmetic and manually writing
+out necessary functions (like matrix multiplication or dot product). (Floating
+point is technically possible using a software floating point library, but I
+haven't bothered to try implementing one as it is likely very difficult.)
+Surprisingly, even though kernels were limited to fixed point, I was still able
+to get some good looking demos.
+</p>
+<p>
+The following video shows the processor running the Rule 30 elementary cellular
+automaton. The simulation is artificially slowed and runs on only a single core.
+Yes, this is intentional. The simulation was otherwise way too fast and did not
+look good.
+</p>
+<div class="video"><video width="320" height="240" controls>
+ <source src="content/rule30.webm" type=video/webm>
+ Rule 30 Elementary Cellular Automaton Simulation
+</video></div>
+<p>
+You might notice the strange purple bar on the left side of the screen. This is
+caused by the (soft) HDMI encoder that is used to generate the video output. Can
+this be fixed? Probably. Is it a big deal? I don't think so. And so it stays.
+The demo looks cool (in my opinion) but it doesn't showcase the real power of
+the processor. For that, I wrote a Mandelbrot set rendering kernel. The
+Mandelbrot set is an "image" that is found by computing the rate of divergence
+of the equation <code>z_next = z_prev^2 + c</code>. I don't want to talk too
+much about the math, but the kernel just computes the formula a bunch of times
+and sees when (or if) the variable eventually blows up (i.e. gets really big in
+magnitude). The demo shows the computation twice: once on a single core and
+again but using all 12 cores.
+</p>
+<div class="video"><video width="320" height="240" controls>
+ <source src="content/mandelbrot.webm" type=video/webm>
+ Single Core and Multi Core Mandelbrot Set Rendering
+</video></div>
+<p>
+Clearly, the single core rendering is very slow while the multicore rendering is
+quite fast. Even though cores still have to wait their turn to write to memory,
+the heavy computation involved means that the memory bus sits mostly idle and so
+there are few conflicts between cores.
+</p>
+<p>
+Another interesting demo is Conway's Game of Life. The GoL is another cellular
+automaton but produces more interesting patterns than Rule 30. The demo uses
+Rule 30 as a pseudorandom initial state and then simulates the Game of Life.
+</p>
+<div class="video"><video width="320" height="240" controls>
+ <source src="content/gol.webm" type=video/webm>
+ Conway's Game of Life Simulation
+</video></div>
+<p>
+The demo runs the Game of Life on multiple cores. This is not because the
+simulation is computationally complex, but instead because it requires heavy
+memory usage. For each pixel, the kernel fetches the 8 surrounding pixels to
+compute the next state which is stored in the right half of the framebuffer.
+Once all cells have been computed, the next state buffer is copied back to the
+left half of the framebuffer so that the next computation can begin. I have also
+run this demo on a single core and it is abysmally slow. So, even though it is a
+memory heavy kernel, the cores are still able to share the memory bus
+effectively, thereby increasing the simulation speed.
+</p>
+<p>
+The final interesting demo is only half baked (visually and computationally, but
+still interesting). It rotates the vertices of a cube by applying a rotation
+matrix to them. The demo is also artificially slowed and single core since it did
+not look good at true computational speed.
+</p>
+<div class="video"><video width="320" height="240" controls>
+ <source src="content/cube.webm" type=video/webm>
+ A 3 Dimensional Rotating Cube
+</video></div>
+<p>
+Again, due to time constraints, I did not make the cube visually appealing
+(edges or face coloring). However, the video clearly shows a rotating cube.
+Eventually, the cube begins to fall apart and this is due to inaccuracy in the
+rotation matrix and vertex coordinates. Because they are fixed point, only so
+much accuracy can be encoded. To preserve some accuracy, the matrix values were
+precomputed and somewhat optimized (I checked a lot of different values by hand
+and the one in this demo is one of the best ones I found). The kernel does not
+re-normalize the vertices, even though it should, once again due to time
+constraints.
+</p>
+<p>
+The rotating cube demo is also single core for another reason (which is hinted
+at above). The cube's vertices are being stored in VRAM right below the
+framebuffer (so they are not visible in the video output). Because of this, if
+multiple cores are trying to read the vertices to draw them to the screen and
+then modify the vertices and store them back, the coordinates will end up being
+corrupted and too many vertices will be drawn to the screen. The real fix to
+this is to have each core work on an independent set of vertices and synchronize
+the cores before clearing the screen. I did not elect to do this since it adds
+additional complexity and still would look bad due to being much too fast.
+</p>
+<p>
+Below are some more fun renderings.
+</p>
+
+<table class="images"><tbody>
+ <tr>
+ <td class="image"><img src="content/blurbrot.jpg" width="320" height="240"></td>
+ <td class="image"><img src="content/blurrule30.jpg" width="320" height="240"></td>
+ </tr>
+ <tr>
+ <td class="caption">Blurred Mandelbrot Set</td>
+ <td class="caption">Blurred Rule 30 Simulation</td>
+ </tr>
+ <tr>
+ <td class="image"><img src="content/julia1.jpg" width="320" height="240"></td>
+ <td class="image"><img src="content/julia2.jpg" width="320" height="240"></td>
+ </tr>
+ <tr>
+ <td class="caption">Julia Set 1</td>
+ <td class="caption">Julia Set 2</td>
+ </tr>
+ <tr>
+ <td class="image"><img src="content/julia3.jpg" width="320" height="240"></td>
+ <td class="image"><img src="content/ship.jpg" width="320" height="240"></td>
+ </tr>
+ <tr>
+ <td class="caption">Julia Set 3</td>
+ <td class="caption">Burning Ship Fractal</td>
+ </tr>
+</tbody></table>
+
+<p>
+All these demos were painstakingly written by hand in PARAGON assembly.
+</p>
+
+<h3>What Now?</h3>
+<p>
+Even though it clearly works, PARAGON is lacking some features. The inclusion of
+large DDR3 memory would be very useful as it would allow for an increased
+framebuffer resolution and also add a ton more memory for general purpose use
+(e.g. double buffering). Since VRAM is byte addressable (at least as far as the
+compute cores are concerned), it is difficult (or at least annoying) to store
+large 24 bit values from the registers into memory. One fix to this might be to
+add a private scratchpad where cores may store general purpose memory, function
+call stacks, etc. Currently, recursion or functions calling functions is very
+difficult to implement due to the byte addressing limitation.
+</p>
+<p>
+The processors could also use some form of hardware synchronization. A simple
+solution is adding a SYNC instruction. Of course, this instruction could be
+shared with other instructions to form a set of processor control instructions.
+Hardware synchronization would allow for writing kernels where cores must
+interact with each other but will likely diverge in execution without using
+clunky software synchronization.
+</p>
+<p>
+Fixed point arithmetic is horrible to work with. Poor precision, rounding
+errors, etc. The addition of a floating point unit would greatly increase the
+ability to write high quality, high precision kernels. Of course, FPUs are huge,
+so there would likely only be one or two shared between all the cores. Yes, this
+would be slow, but it's worth having high precision arithmetic.
+</p>
+<p>
+All in all, I think I'm happy with how this turned out.
+</p>
+
+!!$DPP <template/foot.html # >
diff --git a/error/404.html b/error/404.html
new file mode 100644
index 0000000..42e22a3
--- /dev/null
+++ b/error/404.html
@@ -0,0 +1,8 @@
+!!
+$DPP "404 Page Not Found" "Welcome to the Bun Shack" <template/head.html # >
+$DPP <template/bar.html # >
+!!
+
+<h1>404</h1>
+
+!!$DPP <template/foot.html # >
diff --git a/index.html b/index.html
new file mode 100644
index 0000000..f0cf48b
--- /dev/null
+++ b/index.html
@@ -0,0 +1,47 @@
+!!
+$DPP "Bun Shack" "Welcome to the Bun Shack" <template/head.html # >
+$DPP home <template/bar.html # >
+!!
+
+<link rel="stylesheet" href="/syntax.css">
+<pre><code class="language-c">
+<span class="preproc">#include</span> <span class="delimiter">&lt;stdio.h&gt;</span>
+
+<span class="type">int</span> <span class="function">main</span>(<span class="type">void</span>) {
+ <span class="function">puts</span>(<span class="string">"Hello World!"</span>);
+ <span class="keyword">return</span> <span class="number">0</span>;
+}
+
+</code></pre>
+
+<h3>Links</h3>
+<p><a class="out" href="https://github.com/Raniconduh">Github</a></p>
+
+<h3>Projects</h3>
+
+<table>
+ <tbody>
+ <tr>
+ <td><a class="out" href="https://github.com/Raniconduh/paragon">paragon</a></td>
+ <td><a href="blog/paragon">A parallel graphics processor</a> with working hardware, assembler, and emulator</td>
+ </tr>
+ <tr>
+ <td><a class="out" href="https://github.com/Raniconduh/shrimp">shrimp</a></td>
+ <td>A custom 16 bit microprocessor</td>
+ </tr>
+ <tr>
+ <td><a class="out" href="https://github.com/Raniconduh/shrimpas">shrimpas</a></td>
+ <td>Simple assembler for 16 bit SHRIMP</td>
+ </tr>
+ <tr>
+ <td><a class="out" href="https://github.com/Raniconduh/wish">wish</a></td>
+ <td>Fairly usable shell, with variables, pipes, and more!</td>
+ </tr>
+ <tr>
+ <td><a class="out" href="https://github.com/Raniconduh/games">games</a></td>
+ <td>A collection of small, simple games</td>
+ </tr>
+ </tbody>
+</table>
+
+!!$DPP <template/foot.html # >
diff --git a/style.css b/style.css
new file mode 100644
index 0000000..4ee9a69
--- /dev/null
+++ b/style.css
@@ -0,0 +1,70 @@
+* {
+ background-color: #282828;
+}
+
+body {
+ color: #EEEEEE;
+ padding-bottom: 3vh;
+}
+
+#body {
+ max-width: 800px;
+ margin: 0 auto;
+ padding: 20px;
+}
+
+p {
+ padding-bottom: 15px;
+ text-align: justify;
+ font-size: 16px;
+}
+
+thead {
+ text-align: left;
+}
+
+td {
+ padding-right: 15px;
+}
+
+table.images {
+ margin-left: auto;
+ margin-right: auto;
+ text-align: center;
+}
+
+td.caption {
+ padding-bottom: 15px;
+}
+
+a:link {
+ color: #B8EAEA;
+}
+
+a:visited {
+ color: #A9CACA;
+}
+
+a.out {
+ color: #AC5645;
+ text-decoration: none;
+ border-bottom: 1px dashed;
+}
+
+#topbar td {
+ padding-right: 25px;
+ padding-bottom: 15px;
+}
+
+#topbar a:link, #topbar a:visited {
+ color: #B8EAEA;
+ text-decoration: none;
+}
+
+#topbar a.active {
+ font-weight: bold;
+}
+
+.video {
+ text-align: center;
+}
diff --git a/syntax.css b/syntax.css
new file mode 100644
index 0000000..63e367b
--- /dev/null
+++ b/syntax.css
@@ -0,0 +1,21 @@
+.preproc {
+ color: #A6E22E;
+}
+.delimiter {
+ color: #8F8F8F;
+}
+.string {
+ color: #E6DB74;
+}
+.type {
+ color: #66D9EF;
+}
+.keyword {
+ color: #F92672;
+}
+.number {
+ color: #AE81FF;
+}
+.function {
+ color: #A6E22E;
+}
diff --git a/template/bar.html b/template/bar.html
new file mode 100644
index 0000000..10099cb
--- /dev/null
+++ b/template/bar.html
@@ -0,0 +1,13 @@
+!!
+HOME=""
+BLOG=""
+case "${1:-}" in
+ home) HOME=active ;;
+ blog) BLOG=active ;;
+esac
+!!
+
+<table id="topbar"><tbody><tr>
+ <td><a class="${HOME}" href="/">Home</a></td>
+ <td><a class="${BLOG}" href="/blog">Blog</a></td>
+</tr></tbody></table>
diff --git a/template/foot.html b/template/foot.html
new file mode 100644
index 0000000..824fe2e
--- /dev/null
+++ b/template/foot.html
@@ -0,0 +1,2 @@
+</div></body>
+</html>
diff --git a/template/head.html b/template/head.html
new file mode 100644
index 0000000..cba3c59
--- /dev/null
+++ b/template/head.html
@@ -0,0 +1,14 @@
+!!: ${1:?Usage: head.html title heading}
+!!: ${2:?Usage: head.html title heading}
+
+<!DOCTYPE html>
+<html lang="en">
+ <head>
+ <meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
+ <link rel="stylesheet" href="/style.css">
+ <title>$1</title>
+ <meta property="og:title" content="$1">
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+ </head>
+<body><div id="body">
+ <h2>$2</h2>
diff --git a/template/readtime.html b/template/readtime.html
new file mode 100644
index 0000000..bd3a896
--- /dev/null
+++ b/template/readtime.html
@@ -0,0 +1,33 @@
+!!
+: ${1:?Usage: readtime.html path}
+
+words="$(awk '
+BEGIN { skip = 0 }
+
+/^!!.+$/ { next }
+
+/^!!$/ {
+ skip = !skip
+ next
+}
+
+!skip {
+ gsub(/<[^>]*>/ , "")
+ if (NF) print
+}
+' "$1" | wc -w)"
+rate=200 # wpm
+
+# round up the number of minutes
+minutes=$(( (words + (rate - 1)) / rate ))
+
+if [ $minutes -ge 60 ]; then
+ hours=$(( minutes / 60 ))
+ minutes=$(( minutes % 60 ))
+ time="$hours hour and $minutes minute"
+else
+ time="$minutes minute"
+fi
+!!
+
+<p><small><b>${time} read</b></small></p>