Saturday, 5 May 2012

ShelobPy now usable!

https://bitbucket.org/swbsystems/shelobpy 

 My python based document text extraction code is now fairly usable. So far it reads:
  • Word doc files (The recent Word 97-2003 format as well as Word 95)
  • Word docx files (Word 2007 onwards)
  • Microsoft Works wps files
  • Open Office odt files
  • PDF files
  • Rich Text rtf files
  • HTML files (also seems OK with the awful Word HTML files)
There will probably be instances where the code chokes or where it doesn't pull out what it should; reports of any such cases, or suggestions, would be welcome.

It uses pyPdf as an external dependency. I'll have a go at the pdf format myself when I get a chance, but for now pyPdf is an easily accessible and stable package so I'll use it.

Friday, 27 April 2012

Reading Microsoft Word Doc Files in Python

Thought I'd post this although it's far from finished. I've been trying to pull text data from various formats using as few external libraries as I can.

The word document format (Word 95 up to Word 2003) is a pretty complex format, especially when compared to the Microsoft Works WPS format (code to read this has been posted previously - this code also has the OleDocument class needed for the code below).

Basic code follows below, but I need to stress that this code is NOT 100% yet. The way I deal with text, both 16 and 8 bit, isn't great, and I'm sure there are more nasties that the format can add to the text, on top of the way it encodes tables and hyperlinks. As I'm going to have to leave this code for a while before I improve it (other priorities) I thought I'd post it as-is in case I forget to do so later.

#!/usr/bin/env python
import sys
import os
import re
import struct

# Bytes outside 7-bit ASCII (plus DEL) in 8-bit text runs. A bytes pattern,
# because it is applied to raw stream data before decoding.
DOCNONASCIIPATTERN8 = re.compile(br"[\x7F-\xFF]")
# The same filter for text that has already been decoded from UTF-16.
DOCNONASCIIPATTERN16 = re.compile(u"[\u007F-\uFFFF]")
# Low control characters that get replaced by a space (table markup).
DOCTABLECLEAN = re.compile(r"[\x01-\x08]")
# Bare carriage returns, rewritten as "\r\n" in the final text.
DOCSTRIPPATTERN = re.compile(r"\r")
# A hyperlink field: \x13 <field code with quoted uri> \x14 <display> \x15.
# Every wildcard is non-greedy so that two hyperlinks in the same run of text
# are matched separately instead of being swallowed into one big match.
DOCHYPERLINKPATTERN = re.compile(
    r"\x13.*?HYPERLINK.*?\"(?P<uri>.*?)\".*?\x14(?P<display>.*?)\x15")

from OleDocument import OleDocument, ReaderError

class DOCReader(object):
    """Pull the plain text out of a binary Word ``.doc`` file (Word 95 - 2003).

    The file is opened through ``OleDocument``. ``extract_text`` reads the
    ``WordDocument`` stream and dispatches on the version word in its header;
    afterwards ``file_version`` holds a human readable label for the format.
    """

    # fWhichTblStm bit of the FIB flag word: set -> "1Table", clear -> "0Table".
    # (0x40 belongs to the cQuickSaves counter and only matches by accident.)
    _WHICH_TABLE_FLAG = 0x0200
    # Record type tags found inside the Clx structure of the table stream.
    _CLXT_PRC = 0x01
    _CLXT_PCDT = 0x02
    # Piece descriptor "fc" word: bit 30 marks 8-bit text, the low 30 bits
    # carry the offset into the WordDocument stream.
    _FC_COMPRESSED = 0x01 << 30
    _FC_OFFSET_MASK = 0xFFFFFFFF >> 2

    def __init__(self, file_name):
        """Open ``file_name`` as an OLE container and bind the cleanup patterns."""
        self.file_name = file_name
        self.document = OleDocument(file_name)
        self.non_ascii_pattern8 = DOCNONASCIIPATTERN8
        self.non_ascii_pattern16 = DOCNONASCIIPATTERN16
        self.table_cleanup = DOCTABLECLEAN
        self.strip_pattern = DOCSTRIPPATTERN
        self.hyperlink_pattern = DOCHYPERLINKPATTERN
        self.file_version = "Unknown Version"

    def extract_text(self):
        """Return the document text as a unicode string.

        Raises ReaderError when the stream is not a Word document or is of a
        version this reader does not handle (anything before Word 95).
        """
        # The WordDocument stream has most of the data we need.
        doc_stream = self.document.read_stream("WordDocument")
        # The magic and version words define what version document this is.
        magic, version, flags = struct.unpack_from("<HH6xH", doc_stream)
        if magic not in (0xA5EC, 0xA5DC):
            raise ReaderError("Invalid format - not a Word doc file")
        if version < 101:
            raise ReaderError("Very old doc file - cant handle before Word 95")
        if version in (101, 103, 104):
            self.file_version = "Word 95"
            return self._process_word95(doc_stream)
        if version >= 193:
            self.file_version = "Word 97 - 2003"
            return self._process_word97(doc_stream, flags)
        raise ReaderError("Unknown version of Word")

    def _clean_hyperlinks(self, buff):
        """Rewrite each hyperlink field in ``buff`` as ``display (link: uri)``.

        A replacement *function* is used rather than a replacement string: a
        string would be parsed as a regex template, so a backslash in the uri
        or display text (a Windows path, for instance) would either raise or
        be expanded as a group reference.
        """
        def _render(match):
            return "%s (link: %s)" % (match.group("display"),
                                      match.group("uri"))
        return self.hyperlink_pattern.sub(_render, buff)

    def _process_word95(self, doc_stream):
        """Extract the text of a Word 95 document from its main stream."""
        # The text start and end offsets sit early on in the stream.
        text_start, text_end = struct.unpack_from("<II", doc_stream, 0x18)
        raw = self.non_ascii_pattern8.sub(b"", doc_stream[text_start:text_end])
        # Only 7-bit bytes are left at this point, so this decode cannot fail.
        text = self.table_cleanup.sub(" ", raw.decode("utf8"))
        text = self._clean_hyperlinks(text)
        return self.strip_pattern.sub("\r\n", text)

    def _process_word97(self, doc_stream, flags):
        """Extract the text of a Word 97 - 2003 document.

        ``flags`` is the flag word from the stream header; it selects which
        table stream holds the piece table describing where the text lives.
        Raises ReaderError if no piece table can be found or it never reaches
        the expected final character position.
        """
        if flags & self._WHICH_TABLE_FLAG:
            table_names = ("1Table", "0Table")
        else:
            table_names = ("0Table", "1Table")
        # Pull the character counts out of the WordDocument stream. If there
        # is any text in footnotes, headers etc. the final cp is the sum of
        # all the counts plus one extra character.
        offset = 62
        count = struct.unpack_from("<H", doc_stream, offset)[0]
        offset += 2
        # Explicit "<": the file is little-endian whatever the host is.
        sizes = struct.unpack_from("<12x8I", doc_stream, offset)
        text_size = sizes[0]
        if any(sizes[1:]):
            final_cp = sum(sizes) + 1
        else:
            final_cp = text_size
        # Skip across the rest of this block to the offset/size of the Clx.
        offset += count * 4
        offset += (66 * 4) + 2  # Add offset from main block + count variable
        clx_offset, clx_size = struct.unpack_from("<II", doc_stream, offset)
        try:
            table_stream = self.document.read_stream(table_names[0])
        except ReaderError:
            # The flagged stream is missing; fall back to the other one.
            table_stream = self.document.read_stream(table_names[1])
        # The Clx may open with any number of Prc records (tag 0x01, 16-bit
        # length, data); step over them to reach the Pcdt (tag 0x02).
        pos = clx_offset
        clxt = struct.unpack_from("<B", table_stream, pos)[0]
        while clxt == self._CLXT_PRC:
            prc_size = struct.unpack_from("<H", table_stream, pos + 1)[0]
            pos += 3 + prc_size
            clxt = struct.unpack_from("<B", table_stream, pos)[0]
        if clxt != self._CLXT_PCDT:
            raise ReaderError("Not a valid clxt in the table stream")
        # The Pcdt length is a 32-bit field (hence the 5 byte header).
        size = struct.unpack_from("<I", table_stream, pos + 1)[0]
        # Now read the list of cp offsets showing how the text is broken up.
        cp_list = []
        offset = pos + 5
        for _ in range(size // 4):
            cp = struct.unpack_from("<I", table_stream, offset)[0]
            cp_list.append(cp)
            offset += 4
            if cp == final_cp:
                break
        else:
            # Ran through the whole table without meeting the final cp.
            raise ReaderError("Parse error - doc file has no final cp")
        # One 8 byte piece descriptor follows for each pair of cps; it says
        # whether the text is 8 or 16 bit and where it sits in the stream.
        parts = []
        for cp, next_cp in zip(cp_list, cp_list[1:]):
            fc = struct.unpack_from("<2xI", table_stream, offset)[0]
            parts.append(self._process_block97(fc & self._FC_OFFSET_MASK,
                                               cp, next_cp,
                                               fc & self._FC_COMPRESSED,
                                               doc_stream))
            offset += 8
        return self.strip_pattern.sub("\r\n", u"".join(parts))

    def _process_block97(self, text_offset, cp, next_cp, compressed,
                         doc_stream):
        """Decode and clean one piece of text.

        ``text_offset`` is the offset taken from the piece descriptor, ``cp``
        and ``next_cp`` bound the piece in characters, and ``compressed`` is
        truthy for 8-bit text. The piece holds exactly ``next_cp - cp``
        characters.
        """
        length = next_cp - cp
        if compressed:
            # For 8-bit text the stored offset is twice the real byte offset.
            start = text_offset // 2
            raw = self.non_ascii_pattern8.sub(
                b"", doc_stream[start:start + length])
            text = self.table_cleanup.sub(" ", raw.decode("utf8"))
            return self._clean_hyperlinks(text)
        raw = doc_stream[text_offset:text_offset + 2 * length]
        # Always little-endian; plain "utf16" would sniff for a BOM and
        # otherwise use the host byte order.
        text = raw.decode("utf-16-le", "replace")
        text = self._clean_hyperlinks(text)
        text = self.non_ascii_pattern16.sub(u"", text)
        return self.table_cleanup.sub(" ", text)

if __name__ == '__main__':
    # Script entry point: print the text of the file named on the command line.
    reader = DOCReader(sys.argv[1])
    print(reader.extract_text())
   

Sunday, 22 April 2012

WPS File Reader in Python (Round 2)

I've started work on pulling text from the Microsoft Word format (pre Office 2007 doc files) using only python. I had already completed a "passable" Microsoft Works (wps file) reader, but took a few liberties with the format and avoided reading the file properly via the OLE Compound File specification. A few hours messing with the doc file via my hex editor told me that I can't avoid doing things properly this time. So I wrote a rather simplified OleDocument class that simply parses the file and allows extraction of streams (it doesn't touch the mini streams and leaves out a few things that I don't need so far).

So here's the updated wps code with the new OleDocument class. The WPSReader is a little simpler now and seems far more robust. It still needs testing though!

Now I need to start on the doc format. That's going to be far more difficult!

#!/usr/bin/env python
import os
import sys
import struct
import re
from collections import namedtuple

# Matches the bare carriage returns that WPSReader.extract_text rewrites as
# "\r\n" in the text it returns.
WPSSTRIPPATTERN = re.compile(r"\r")

class ReaderError(Exception):
    """Raised when a file cannot be read as the expected document format."""

class OleDocument(object):
    """Simplified reader for OLE compound files (the .doc / .wps container).

    On construction the header, the sector allocation table (FAT) and the
    directory are parsed. ``read_stream`` then returns the bytes of a named
    stream by following its sector chain. The mini sector size is read from
    the header but nothing here uses it: every stream is read through the
    regular FAT.

    Attributes:
        sectors: the FAT; ``sectors[n]`` is the sector that follows sector n.
        directories: maps an entry name to ``(start_sector, size)``.
    """

    _SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
    # Sector ids above this value are markers (end of chain, free, ...).
    _MAX_REGULAR_SECTOR = 0xFFFFFFFA
    # Number of FAT sector ids stored in the header itself.
    _HEADER_DIFAT_ENTRIES = 109
    _DIR_ENTRY_SIZE = 128

    def __init__(self, file_name):
        """Parse the container structure of ``file_name``."""
        self.file_name = file_name
        self.sectors = []
        self.directories = {}
        self._parse_contents()

    def _sector_offset(self, sector):
        """Return the file offset of ``sector`` (the header fills one sector)."""
        return self.sector_size * (sector + 1)

    def _read_fat_sector(self, fat_sector, fd):
        """Yield every sector id stored in the FAT sector ``fat_sector``."""
        fd.seek(self._sector_offset(fat_sector), os.SEEK_SET)
        entries = struct.unpack("<%dI" % (self.sector_size // 4),
                                fd.read(self.sector_size))
        for entry in entries:
            yield entry

    def _parse_contents(self):
        """Read the header, the FAT and the directory from the file.

        Raises ReaderError if the file signature is wrong or a sector chain
        is corrupt.
        """
        with open(self.file_name, "rb") as fd:
            if fd.read(8) != self._SIGNATURE:
                raise ReaderError("Not a valid Ole Storage Document")
            header = fd.read(68)
            (sector_shift, mini_sector_shift, fat_sector_count,
             first_dir_sector, first_mini_sector, mini_sector_count,
             first_difat_sector, difat_sector_count) = \
                struct.unpack("<22xHH10xII8xIIII", header)
            self.sector_size = 1 << sector_shift
            self.mini_sector_size = 1 << mini_sector_shift

            # The header lists at most 109 FAT sectors ...
            in_header = min(fat_sector_count, self._HEADER_DIFAT_ENTRIES)
            fat_sectors = list(struct.unpack("<%dI" % in_header,
                                             fd.read(4 * in_header)))
            # ... any further ones are listed in a chain of DIFAT sectors,
            # each ending with the id of the next DIFAT sector.
            per_sector = self.sector_size // 4
            difat_sector = first_difat_sector
            while len(fat_sectors) < fat_sector_count and \
                    difat_sector <= self._MAX_REGULAR_SECTOR:
                fd.seek(self._sector_offset(difat_sector), os.SEEK_SET)
                entries = struct.unpack("<%dI" % per_sector,
                                        fd.read(self.sector_size))
                fat_sectors.extend(entries[:-1])
                difat_sector = entries[-1]
            del fat_sectors[fat_sector_count:]  # drop the unused padding

            for fat_sector in fat_sectors:
                self.sectors.extend(self._read_fat_sector(fat_sector, fd))

            # Now read the directories
            chunks = []
            for dir_sector in self._get_sectors(first_dir_sector):
                fd.seek(self._sector_offset(dir_sector), os.SEEK_SET)
                chunks.append(fd.read(self.sector_size))
            directory = b"".join(chunks)
            # Walk whatever was read in 128 byte entries; the number of
            # entries per sector depends on the sector size.
            last_start = len(directory) - self._DIR_ENTRY_SIZE
            for pos in range(0, last_start + 1, self._DIR_ENTRY_SIZE):
                raw_name, sector, size = struct.unpack_from(
                    "<64s52xII4x", directory, pos)
                # Names are NUL terminated UTF-16LE; anything after the
                # terminator is leftover and not part of the name.
                name = raw_name.decode("utf-16-le", "replace")
                name = name.split(u"\x00", 1)[0]
                self.directories[name] = (sector, size)

    def _get_sectors(self, sector):
        """Yield the ids of the chain that starts at ``sector``.

        The chain stops at any marker value. Raises ReaderError when a link
        points outside the FAT or the chain is longer than the FAT itself
        (which can only happen if it loops).
        """
        hops = 0
        while sector <= self._MAX_REGULAR_SECTOR:
            yield sector
            hops += 1
            if sector >= len(self.sectors) or hops > len(self.sectors):
                raise ReaderError("Corrupt sector chain in Ole Storage "
                                  "Document")
            sector = self.sectors[sector]

    def read_stream(self, name):
        """Return the contents of the stream called ``name``.

        The result is cut to the size recorded in the directory, so the
        padding of the last sector is not returned. Raises ReaderError if
        there is no such stream.
        """
        if isinstance(name, bytes):
            name = name.decode("ascii")
        if name not in self.directories:
            raise ReaderError("No stream called %s" % name)
        start, size = self.directories[name]
        chunks = []
        remaining = size
        with open(self.file_name, "rb") as fd:
            for sector in self._get_sectors(start):
                fd.seek(self._sector_offset(sector), os.SEEK_SET)
                chunks.append(fd.read(self.sector_size))
                remaining -= self.sector_size
                if remaining <= 0:
                    break
        return b"".join(chunks)[:size]

class WPSReader(object):
    """Extract the text of a Microsoft Works (wps) file.

    The file is opened through ``OleDocument``; the text is located via the
    entry table at the start of the ``CONTENTS`` stream.
    """

    def __init__(self, file_name):
        """Open ``file_name`` as an OLE container."""
        self.document = OleDocument(file_name)
        self.strip_pattern = WPSSTRIPPATTERN

    def _process_entries(self, entry_buff):
        """Scan one block of entries for the ``TEXT`` entry.

        ``entry_buff`` must start at the block header. Returns
        ``(entries_in_block, 0, text_offset, text_size)`` when the entry is
        found, otherwise ``(entries_in_block, next_block_offset, 0, 0)``.
        Raises ReaderError if the block does not start with the entry magic.
        """
        magic, local, next_offset = struct.unpack_from("<HHI", entry_buff)
        if magic != 0x01F8:
            raise ReaderError("Invalid format - Entry magic tag incorrect")
        entry_pos = 0x08  # 2 WORDS & 1 DWORD
        for _ in range(local):
            # Read the fixed 24 byte prefix in place and step by the size the
            # record declares, so a record that is longer than the prefix
            # does not break the unpack.
            size, name, offset, entry_size = struct.unpack_from(
                "<H4s10xII", entry_buff, entry_pos)
            if name == b"TEXT":  # Success!
                return (local, 0x00, offset, entry_size)
            entry_pos += size
        return (local, next_offset, 0x00, 0x00)  # Needs to be run again

    def extract_text(self):
        """Return the document text as unicode with "\\r\\n" line endings.

        Raises ReaderError when no non-empty TEXT entry can be found.
        """
        buff = self.document.read_stream("CONTENTS")
        total_entries = struct.unpack_from("<12xH", buff)[0]
        entries_pos = 24
        visited = set()
        while True:
            visited.add(entries_pos)
            entries, next_offset, text_header_offset, text_size = \
                self._process_entries(buff[entries_pos:])
            if text_size:  # TEXT found
                break
            total_entries -= entries
            # Stop when the entries run out, when there is no next block, or
            # when the chain points back at a block already scanned (which
            # would otherwise loop forever on a corrupt file).
            if not (total_entries and next_offset) or next_offset in visited:
                raise ReaderError("Unable to find TEXT secion. File corrupt?")
            entries_pos = next_offset  # Move to next block
        text = buff[text_header_offset:text_header_offset + text_size]
        # Explicitly little-endian; plain "UTF16" sniffs for a BOM and
        # otherwise falls back to the byte order of the host.
        return self.strip_pattern.sub("\r\n", text.decode("utf-16-le"))
       
       
if __name__ == '__main__':
    # Script entry point: print the text of the file named on the command line.
    print(WPSReader(sys.argv[1]).extract_text())
   

Wednesday, 18 April 2012

Reading Microsoft Works WPS files in Python

* The code below has been superseded - for a better wps reader see the later post "WPS File Reader in Python (Round 2)" *

I'm trying to extract text from various file formats for a search engine and trying to avoid any external packages. I have 10 thousand files to extract text from and a lot of them are WPS files (Microsoft Works - you know, that free office suite that comes preinstalled on many windows boxes).

I was opening the files and using regular expressions and some text sanitizing to try and get decent text from the file. Unfortunately, the text is split into blocks so I got some part-words as well as font names and other junk words from the metadata of the files. I had used libwps in the past but didn't want the dependency in my code. Most Windows based document formats from that stage use some sort of OLE-Stream content that is often kind of difficult to get your head around when you're staring at the bytes in a hex editor. After a little reading of the libwps code, a few calculations and jumps in Hex Workshop and a few guesses I managed to work out some code to pull text from this format. It's a work in progress and it needs some extensive testing (and it'll get it when I'm using it!) but looks good so far:

import re
import struct

class ReaderError(Exception):
    """Raised when a file cannot be read as a Works (wps) document."""


# Magic tag that starts the header block. Of the two tags, WPSReader only
# accepts CHNKWKS. A bytes pattern, because it is run over raw file data.
WPSMAGICPATTERN = re.compile(br"(CHNKWKS|CHNKINK)")
# Bare carriage returns, rewritten as "\r\n" in the extracted text.
WPSSTRIPPATTERN = re.compile(r"\r")

class WPSReader(object):
    """Extract the text of a Microsoft Works (wps) file by scanning raw bytes.

    This version does not parse the OLE container: it searches the whole file
    for the header magic and works from offsets relative to it.
    """

    TEXT_BLOCK = 0x0E00

    def __init__(self, file_name):
        """Remember ``file_name``; nothing is read until ``extract_text``."""
        self.file_name = file_name
        self.magic_pattern = WPSMAGICPATTERN
        self.strip_pattern = WPSSTRIPPATTERN

    def _process_entries(self, entry_buff):
        """Scan one block of entries for the ``TEXT`` entry.

        ``entry_buff`` must start at the block header. Returns
        ``(entries_in_block, 0, text_offset, text_size)`` when the entry is
        found, otherwise ``(entries_in_block, next_block_offset, 0, 0)``.
        Raises ReaderError if the block does not start with the entry magic.
        """
        magic, local, next_offset = struct.unpack_from("<HHI", entry_buff)
        if magic != 0x01F8:
            raise ReaderError("Invalid format - Entry magic tag incorrect")
        entry_pos = 0x08  # 2 WORDS & 1 DWORD
        for _ in range(local):
            # The record length and the size of the data the record points at
            # are kept in separate names: stepping to the next record must
            # use the record length, not the data size.
            record_size, name, offset, data_size = struct.unpack_from(
                "<H4s10xII", entry_buff, entry_pos)
            if name == b"TEXT":  # Success!
                return (local, 0x00, offset, data_size)
            entry_pos += record_size
        return (local, next_offset, 0x00, 0x00)  # Needs to be run again

    def extract_text(self):
        """Return the document text as unicode with "\\r\\n" line endings.

        Raises ReaderError when the header magic is missing, when the file
        predates Works 8, or when no non-empty TEXT entry can be found.
        """
        with open(self.file_name, "rb") as fd:
            buff = fd.read()
        matches = self.magic_pattern.search(buff)
        if not matches:
            raise ReaderError("No 'Magic' block: not a valid WPS file")
        if matches.groups()[0] == b"CHNKINK":
            raise ReaderError("Unable to convert a WPS file prior to version 8")
        headers_start = matches.start()
        entries_pos = headers_start + 24
        total_entries = struct.unpack_from("<12xH", buff, headers_start)[0]
        visited = set()
        while True:
            visited.add(entries_pos)
            entries, next_offset, text_header_offset, text_size = \
                self._process_entries(buff[entries_pos:])
            if text_size:  # TEXT found
                break
            total_entries -= entries
            next_pos = next_offset + self.TEXT_BLOCK  # Move to next block
            # Also give up if the chain leads back to a block already
            # scanned, which would loop forever on a corrupt file.
            if not (total_entries and next_offset) or next_pos in visited:
                raise ReaderError("Unable to find TEXT secion. File corrupt?")
            entries_pos = next_pos
        # The text is stored in up to three pieces.
        text_offset = text_header_offset + headers_start  # Start of text
        block_size = min(self.TEXT_BLOCK, text_size)
        text = buff[text_offset:text_offset + block_size]
        text_size -= block_size
        block_size = min(self.TEXT_BLOCK, text_size)
        if text_size:
            text_offset = 0x800  # Seems to always be the location of second block
            text += buff[text_offset:text_offset + block_size]
            text_size -= block_size
        if text_size:
            text_offset = text_header_offset + headers_start + self.TEXT_BLOCK
            text += buff[text_offset:text_offset + text_size]
        # Explicitly little-endian; plain "UTF16" sniffs for a BOM and
        # otherwise falls back to the byte order of the host.
        return self.strip_pattern.sub("\r\n", text.decode("utf-16-le"))

import sys
if __name__ == '__main__':
    # Only run when executed as a script, so that importing this code for its
    # WPSReader class does not try to read sys.argv[1] and print.
    print(WPSReader(sys.argv[1]).extract_text())

The greatest number of files are the old style Word 95-2003 (doc) files. Now I need to try to do the same with those!

Friday, 13 April 2012

ShelobPy: Python File Spider

Added some code to Bitbucket today - https://bitbucket.org/swbsystems/shelobpy

I need a way to pull text out of various document files (Word - both new and old, Open Office, Rich Text, MS Works, PDF, HTML...etc). I had worked on a C project to do this, but the libraries I needed were a pain to install and get working. It had to run on Linux as it's going on a web-server and I didn't want to add the Open Office runtime to do the conversions.

So far I have used pyPDF to pull out PDF text, Beautiful Soup to make a little sense of some terrible Word-HTML mark-up and the rest has been done with a little brute force and regular expressions!

It is still __VERY__ hackish, but does use some neat stuff like Natural Language Processing to pull out some decent search terms. Still lots of work to go on it though!


Sunday, 11 December 2011

Beating the GCHQ challenge (part 3)

This final stage of the challenge is a windows-only affair. You get an executable file without much else.

*This challenge is a straight cracking/reverse-engineering exercise. I will not detail how I do this, but I will detail the puzzle that exists in the code.

Running the executable (in a safe, disposable virtual machine of course!) reveals that the executable needs an extra dll. A quick Google search reveals that this is the crypt module of cygwin, so I installed the correct libraries and tried again:


Entering the hostname of the challenge website at least told me that the program also wanted a licence file. From there I dived into the code and found the following:

  • A rather interesting string - "hqDTK7b8K2rvw".
  • Code that searches for a file named "licence.txt", sets aside 24 bytes of zeroed memory and then uses this memory to load the contents of the file.
  • The first 4 bytes of this data are checked against a hard coded value - 0x67636871. This is "gchq" in ascii. So far so good!
  • The next 20 bytes are then passed to the unix "crypt" function with the mystery string ("hqDTK7b8K2rvw") as a salt. The return is again checked against the mystery string and if it's the same, the code continues.
  • At this point, the final 12 bytes of the licence are loaded onto the stack and the code branches to a new procedure.
  • This new procedure tries to contact the hostname you specified on port 80 and perform a HTTP  GET request. The URL used for this is interesting: "GET /%s/%x/%x/%x/key.txt HTTP/1.0\r\n\r\n". For the first string in this format specifier, the program passes the mystery string ("hqDTK7b8K2rvw"). The other 3 hex values are the 12 bytes read at the end of the licence.
Here's where it gets interesting. Looking at the crypt man-page, the function discards all but the first 8 bytes of the password and all but the first 2 bytes of the salt. The salt then becomes the first 2 bytes of the encrypted password (that is how the salt and encrypted password can be compared to see if the correct password is given). 

So I need to find a 20 byte password. How about "canyoucrackit.co.uk" with a NULL char at the end? Nope. That didn't work :(



I realised that if I tried a brute force attack on this encrypted string it would take ages.  Also, as all but the first 8 chars are discarded, the final 12 bytes would not be revealed by the attack, and this was the information I needed. At this point I was stuck.

Then I remembered the weird information from the other 2 stages:

  • The jumped DWORD in stage 1 - 0xa3bfc2af
  • The firmware values from stage 2 - 0xd2ab1f05, 0xda13f110
Now, trying these values in the URL gives me www.canyoucrackit.co.uk/hqDTK7b8K2rvw/a3bfc2af/d2ab1f05/da13f110/key.txt

Go to that page and instead of a 404 page asking you to try again you get the following text: "Pr0t3ct!on#cyber_security@12*12.2011+". Put this into the form on http://www.canyoucrackit.co.uk/ and you get through to this:





You get a chance to apply for:
  • Cyber Security Specialist - GC10 (£25,446)
  • Senior Cyber Security Specialist - GC9 (£31,152)
And that's that for this year, unless there's more in that final block of memory on stage 2!


Friday, 9 December 2011

Beating the GCHQ challenge (part 2)

So after the let-down of having to complete another test after the last one, I found myself with the following javascript file:



//--------------------------------------------------------------------------------------------------
//
// stage 2 of 3
//
// challenge:
//   reveal the solution within VM.mem
//
// disclaimer:
//   tested in ie 9, firefox 6, chrome 14 and v8 shell (http://code.google.com/apis/v8/build.html),
//   other javascript implementations may or may not work.
//
//--------------------------------------------------------------------------------------------------


// VM bundles the register state (cpu), the memory image (mem) and the exec()
// entry point. In this version of the file exec() is a stub that only throws.
var VM = {
 
    // Register state. The architecture notes inside exec() describe r0-r3 as
    // general-purpose registers, cs and ds as segment registers and fl as the
    // flags register. ip is presumably the instruction pointer.
    cpu: {
        ip: 0x00,
   
        r0: 0x00,
        r1: 0x00,
        r2: 0x00,
        r3: 0x00,
   
        cs: 0x00,
        ds: 0x10,
   
        fl: 0x00,
   
        // Not mentioned anywhere in the instruction-set notes below.
        firmware: [0xd2ab1f05, 0xda13f110]
    },
 
    // 768 bytes, written out as three groups of 256 (16 rows of 16 bytes).
    mem: [
        0x31, 0x04, 0x33, 0xaa, 0x40, 0x02, 0x80, 0x03, 0x52, 0x00, 0x72, 0x01, 0x73, 0x01, 0xb2, 0x50,
        0x30, 0x14, 0xc0, 0x01, 0x80, 0x00, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
       
        0x98, 0xab, 0xd9, 0xa1, 0x9f, 0xa7, 0x83, 0x83, 0xf2, 0xb1, 0x34, 0xb6, 0xe4, 0xb7, 0xca, 0xb8,
        0xc9, 0xb8, 0x0e, 0xbd, 0x7d, 0x0f, 0xc0, 0xf1, 0xd9, 0x03, 0xc5, 0x3a, 0xc6, 0xc7, 0xc8, 0xc9,
        0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9,
        0xda, 0xdb, 0xa9, 0xcd, 0xdf, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
        0x26, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9,
        0x7d, 0x1f, 0x15, 0x60, 0x4d, 0x4d, 0x52, 0x7d, 0x0e, 0x27, 0x6d, 0x10, 0x6d, 0x5a, 0x06, 0x56,
        0x47, 0x14, 0x42, 0x0e, 0xb6, 0xb2, 0xb2, 0xe6, 0xeb, 0xb4, 0x83, 0x8e, 0xd7, 0xe5, 0xd4, 0xd9,
        0xc3, 0xf0, 0x80, 0x95, 0xf1, 0x82, 0x82, 0x9a, 0xbd, 0x95, 0xa4, 0x8d, 0x9a, 0x2b, 0x30, 0x69,
        0x4a, 0x69, 0x65, 0x55, 0x1c, 0x7b, 0x69, 0x1c, 0x6e, 0x04, 0x74, 0x35, 0x21, 0x26, 0x2f, 0x60,
        0x03, 0x4e, 0x37, 0x1e, 0x33, 0x54, 0x39, 0xe6, 0xba, 0xb4, 0xa2, 0xad, 0xa4, 0xc5, 0x95, 0xc8,
        0xc1, 0xe4, 0x8a, 0xec, 0xe7, 0x92, 0x8b, 0xe8, 0x81, 0xf0, 0xad, 0x98, 0xa4, 0xd0, 0xc0, 0x8d,
        0xac, 0x22, 0x52, 0x65, 0x7e, 0x27, 0x2b, 0x5a, 0x12, 0x61, 0x0a, 0x01, 0x7a, 0x6b, 0x1d, 0x67,
        0x75, 0x70, 0x6c, 0x1b, 0x11, 0x25, 0x25, 0x70, 0x7f, 0x7e, 0x67, 0x63, 0x30, 0x3c, 0x6d, 0x6a,
        0x01, 0x51, 0x59, 0x5f, 0x56, 0x13, 0x10, 0x43, 0x19, 0x18, 0xe5, 0xe0, 0xbe, 0xbf, 0xbd, 0xe9,
        0xf0, 0xf1, 0xf9, 0xfa, 0xab, 0x8f, 0xc1, 0xdf, 0xcf, 0x8d, 0xf8, 0xe7, 0xe2, 0xe9, 0x93, 0x8e,
        0xec, 0xf5, 0xc8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   
        0x37, 0x7a, 0x07, 0x11, 0x1f, 0x1d, 0x68, 0x25, 0x32, 0x77, 0x1e, 0x62, 0x23, 0x5b, 0x47, 0x55,
        0x53, 0x30, 0x11, 0x42, 0xf6, 0xf1, 0xb1, 0xe6, 0xc3, 0xcc, 0xf8, 0xc5, 0xe4, 0xcc, 0xc0, 0xd3,
        0x85, 0xfd, 0x9a, 0xe3, 0xe6, 0x81, 0xb5, 0xbb, 0xd7, 0xcd, 0x87, 0xa3, 0xd3, 0x6b, 0x36, 0x6f,
        0x6f, 0x66, 0x55, 0x30, 0x16, 0x45, 0x5e, 0x09, 0x74, 0x5c, 0x3f, 0x29, 0x2b, 0x66, 0x3d, 0x0d,
        0x02, 0x30, 0x28, 0x35, 0x15, 0x09, 0x15, 0xdd, 0xec, 0xb8, 0xe2, 0xfb, 0xd8, 0xcb, 0xd8, 0xd1,
        0x8b, 0xd5, 0x82, 0xd9, 0x9a, 0xf1, 0x92, 0xab, 0xe8, 0xa6, 0xd6, 0xd0, 0x8c, 0xaa, 0xd2, 0x94,
        0xcf, 0x45, 0x46, 0x67, 0x20, 0x7d, 0x44, 0x14, 0x6b, 0x45, 0x6d, 0x54, 0x03, 0x17, 0x60, 0x62,
        0x55, 0x5a, 0x4a, 0x66, 0x61, 0x11, 0x57, 0x68, 0x75, 0x05, 0x62, 0x36, 0x7d, 0x02, 0x10, 0x4b,
        0x08, 0x22, 0x42, 0x32, 0xba, 0xe2, 0xb9, 0xe2, 0xd6, 0xb9, 0xff, 0xc3, 0xe9, 0x8a, 0x8f, 0xc1,
        0x8f, 0xe1, 0xb8, 0xa4, 0x96, 0xf1, 0x8f, 0x81, 0xb1, 0x8d, 0x89, 0xcc, 0xd4, 0x78, 0x76, 0x61,
        0x72, 0x3e, 0x37, 0x23, 0x56, 0x73, 0x71, 0x79, 0x63, 0x7c, 0x08, 0x11, 0x20, 0x69, 0x7a, 0x14,
        0x68, 0x05, 0x21, 0x1e, 0x32, 0x27, 0x59, 0xb7, 0xcf, 0xab, 0xdd, 0xd5, 0xcc, 0x97, 0x93, 0xf2,
        0xe7, 0xc0, 0xeb, 0xff, 0xe9, 0xa3, 0xbf, 0xa1, 0xab, 0x8b, 0xbb, 0x9e, 0x9e, 0x8c, 0xa0, 0xc1,
        0x9b, 0x5a, 0x2f, 0x2f, 0x4e, 0x4e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
    ],
 
    // Entry point, called from the try/catch at the bottom of the script.
    // The body is the machine specification followed by an unconditional
    // throw, so every call raises "VM.exec not yet implemented".
    exec: function()
    {
    // virtual machine architecture
    // ++++++++++++++++++++++++++++
    //
    // segmented memory model with 16-byte segment size (notation seg:offset)
    //
    // 4 general-purpose registers (r0-r3)
    // 2 segment registers (cs, ds equiv. to r4, r5)
    // 1 flags register (fl)
    //
    // instruction encoding
    // ++++++++++++++++++++
    //
    //           byte 1               byte 2 (optional)
    // bits      [ 7 6 5 4 3 2 1 0 ]  [ 7 6 5 4 3 2 1 0 ]
    // opcode      - - -            
    // mod               -          
    // operand1            - - - -
    // operand2                         - - - - - - - -
    //
    // operand1 is always a register index
    // operand2 is optional, depending upon the instruction set specified below
    // the value of mod alters the meaning of any operand2
    //   0: operand2 = reg ix
    //   1: operand2 = fixed immediate value or target segment (depending on instruction)
    //
    // instruction set
    // +++++++++++++++
    //
    // Notes:
    //   * r1, r2 => operand 1 is register 1, operand 2 is register 2
    //   * movr r1, r2 => move contents of register r2 into register r1
    //
    // opcode | instruction | operands (mod 0) | operands (mod 1)
    // -------+-------------+------------------+-----------------
    // 0x00   | jmp         | r1               | r2:r1
    // 0x01   | movr        | r1, r2           | rx,   imm
    // 0x02   | movm        | r1, [ds:r2]      | [ds:r1], r2
    // 0x03   | add         | r1, r2           | r1,   imm
    // 0x04   | xor         | r1, r2           | r1,   imm
    // 0x05   | cmp         | r1, r2           | r1,   imm
    // 0x06   | jmpe        | r1               | r2:r1
    // 0x07   | hlt         | N/A              | N/A
    //
    // flags
    // +++++
    //
    // cmp r1, r2 instruction results in:
    //   r1 == r2 => fl = 0
    //   r1 < r2  => fl = 0xff
    //   r1 > r2  => fl = 1
    //
    // jmpe r1
    //   => if (fl == 0) jmp r1
    //      else nop
   
        throw "VM.exec not yet implemented";
    }
 
};


//--------------------------------------------------------------------------------------------------


// Run the machine and report anything it throws in an alert box.
try {
    VM.exec();
} catch (err) {
    alert('\nError: ' + err + '\n');
}


//--------------------------------------------------------------------------------------------------
 


So we have the details of an instruction set, three segments of memory and some variables that represent registers (oh...and some suspicious looking firmware values!).

The task here is to write a simple disassembler, be able to store a set of decoded instructions and to then execute them as part of a virtual machine. The tricky part is that the first section of code decodes the next one, so you must disassemble, build the instructions, run, disassemble, build more instructions then run again. After this you find the following in the second segment of memory:

'G' , 'E' , 'T' , ' ' , '/' , 'd' , 'a' , '7' , '5' , '3' , '7' , '0' , 'f' , 'e' , '1' , '5' ,
'c' , '4' , '1' , '4' , '8' , 'b' , 'd' , '4' , 'c' , 'e' , 'e' , 'c' , '8' , '6' , '1' , 'f' ,
'b' , 'd' , 'a' , 'a' , '5' , '.' , 'e' , 'x' , 'e' , ' ' , 'H' , 'T' , 'T' , 'P' , '/' , '1' ,

This is the address for the next stage of the challenge.

Interestingly, the third segment seems unchanged - very suspicious as there's not much spare information in any of these challenges.

My final code for this stage:



//--------------------------------------------------------------------------------------------------
//
// stage 2 of 3
//
// challenge:
//   reveal the solution within VM.mem
//
// disclaimer:
//   tested in ie 9, firefox 6, chrome 14 and v8 shell (http://code.google.com/apis/v8/build.html),
//   other javascript implementations may or may not work.
//
//--------------------------------------------------------------------------------------------------


var VM = {  
        // CPU state of the virtual machine. The register set follows the
        // architecture notes kept in exec(): four general-purpose registers,
        // two segment registers and one flags register.
        cpu: {
        ip: 0x00,       // instruction pointer; used as the key into decoded_instructions
       
        r0: 0x00,       // general-purpose registers r0-r3
        r1: 0x00,
        r2: 0x00,
        r3: 0x00,
       
        cs: 0x00,       // code segment register (encoded as r4)
        ds: 0x10,       // data segment register (encoded as r5); 0x10 * 16 = byte 256 of mem
     
        fl: 0x00,       // flags: written by cmp, tested by jmpe
     
        // Supplied with the challenge; not referenced by any method of this object.
        firmware: [0xd2ab1f05, 0xda13f110]
    },
   
    // VM memory as a flat array of byte values, addressed as segment * 16 + offset.
    // It is laid out here as three 256-byte regions separated by blank lines.
    mem: [
        // Region 1 (bytes 0-255): the initial program, starting at cs = 0x00, ip = 0x00.
        0x31, 0x04, 0x33, 0xaa, 0x40, 0x02, 0x80, 0x03, 0x52, 0x00, 0x72, 0x01, 0x73, 0x01, 0xb2, 0x50,
        0x30, 0x14, 0xc0, 0x01, 0x80, 0x00, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
       
        // Region 2 (bytes 256-511): begins at the initial data segment (ds = 0x10).
        // The program rewrites these bytes in place as it runs.
        0x98, 0xab, 0xd9, 0xa1, 0x9f, 0xa7, 0x83, 0x83, 0xf2, 0xb1, 0x34, 0xb6, 0xe4, 0xb7, 0xca, 0xb8,
        0xc9, 0xb8, 0x0e, 0xbd, 0x7d, 0x0f, 0xc0, 0xf1, 0xd9, 0x03, 0xc5, 0x3a, 0xc6, 0xc7, 0xc8, 0xc9,
        0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9,
        0xda, 0xdb, 0xa9, 0xcd, 0xdf, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
        0x26, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9,
        0x7d, 0x1f, 0x15, 0x60, 0x4d, 0x4d, 0x52, 0x7d, 0x0e, 0x27, 0x6d, 0x10, 0x6d, 0x5a, 0x06, 0x56,
        0x47, 0x14, 0x42, 0x0e, 0xb6, 0xb2, 0xb2, 0xe6, 0xeb, 0xb4, 0x83, 0x8e, 0xd7, 0xe5, 0xd4, 0xd9,
        0xc3, 0xf0, 0x80, 0x95, 0xf1, 0x82, 0x82, 0x9a, 0xbd, 0x95, 0xa4, 0x8d, 0x9a, 0x2b, 0x30, 0x69,
        0x4a, 0x69, 0x65, 0x55, 0x1c, 0x7b, 0x69, 0x1c, 0x6e, 0x04, 0x74, 0x35, 0x21, 0x26, 0x2f, 0x60,
        0x03, 0x4e, 0x37, 0x1e, 0x33, 0x54, 0x39, 0xe6, 0xba, 0xb4, 0xa2, 0xad, 0xa4, 0xc5, 0x95, 0xc8,
        0xc1, 0xe4, 0x8a, 0xec, 0xe7, 0x92, 0x8b, 0xe8, 0x81, 0xf0, 0xad, 0x98, 0xa4, 0xd0, 0xc0, 0x8d,
        0xac, 0x22, 0x52, 0x65, 0x7e, 0x27, 0x2b, 0x5a, 0x12, 0x61, 0x0a, 0x01, 0x7a, 0x6b, 0x1d, 0x67,
        0x75, 0x70, 0x6c, 0x1b, 0x11, 0x25, 0x25, 0x70, 0x7f, 0x7e, 0x67, 0x63, 0x30, 0x3c, 0x6d, 0x6a,
        0x01, 0x51, 0x59, 0x5f, 0x56, 0x13, 0x10, 0x43, 0x19, 0x18, 0xe5, 0xe0, 0xbe, 0xbf, 0xbd, 0xe9,
        0xf0, 0xf1, 0xf9, 0xfa, 0xab, 0x8f, 0xc1, 0xdf, 0xcf, 0x8d, 0xf8, 0xe7, 0xe2, 0xe9, 0x93, 0x8e,
        0xec, 0xf5, 0xc8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
       
        // Region 3 (bytes 512-767).
        // NOTE(review): purpose not established by this file; presumably data for a later stage.
        0x37, 0x7a, 0x07, 0x11, 0x1f, 0x1d, 0x68, 0x25, 0x32, 0x77, 0x1e, 0x62, 0x23, 0x5b, 0x47, 0x55,
        0x53, 0x30, 0x11, 0x42, 0xf6, 0xf1, 0xb1, 0xe6, 0xc3, 0xcc, 0xf8, 0xc5, 0xe4, 0xcc, 0xc0, 0xd3,
        0x85, 0xfd, 0x9a, 0xe3, 0xe6, 0x81, 0xb5, 0xbb, 0xd7, 0xcd, 0x87, 0xa3, 0xd3, 0x6b, 0x36, 0x6f,
        0x6f, 0x66, 0x55, 0x30, 0x16, 0x45, 0x5e, 0x09, 0x74, 0x5c, 0x3f, 0x29, 0x2b, 0x66, 0x3d, 0x0d,
        0x02, 0x30, 0x28, 0x35, 0x15, 0x09, 0x15, 0xdd, 0xec, 0xb8, 0xe2, 0xfb, 0xd8, 0xcb, 0xd8, 0xd1,
        0x8b, 0xd5, 0x82, 0xd9, 0x9a, 0xf1, 0x92, 0xab, 0xe8, 0xa6, 0xd6, 0xd0, 0x8c, 0xaa, 0xd2, 0x94,
        0xcf, 0x45, 0x46, 0x67, 0x20, 0x7d, 0x44, 0x14, 0x6b, 0x45, 0x6d, 0x54, 0x03, 0x17, 0x60, 0x62,
        0x55, 0x5a, 0x4a, 0x66, 0x61, 0x11, 0x57, 0x68, 0x75, 0x05, 0x62, 0x36, 0x7d, 0x02, 0x10, 0x4b,
        0x08, 0x22, 0x42, 0x32, 0xba, 0xe2, 0xb9, 0xe2, 0xd6, 0xb9, 0xff, 0xc3, 0xe9, 0x8a, 0x8f, 0xc1,
        0x8f, 0xe1, 0xb8, 0xa4, 0x96, 0xf1, 0x8f, 0x81, 0xb1, 0x8d, 0x89, 0xcc, 0xd4, 0x78, 0x76, 0x61,
        0x72, 0x3e, 0x37, 0x23, 0x56, 0x73, 0x71, 0x79, 0x63, 0x7c, 0x08, 0x11, 0x20, 0x69, 0x7a, 0x14,
        0x68, 0x05, 0x21, 0x1e, 0x32, 0x27, 0x59, 0xb7, 0xcf, 0xab, 0xdd, 0xd5, 0xcc, 0x97, 0x93, 0xf2,
        0xe7, 0xc0, 0xeb, 0xff, 0xe9, 0xa3, 0xbf, 0xa1, 0xab, 0x8b, 0xbb, 0x9e, 0x9e, 0x8c, 0xa0, 0xc1,
        0x9b, 0x5a, 0x2f, 0x2f, 0x4e, 0x4e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
    ],


    // Disassembly of the segment currently being processed: maps a decode
    // position to an array [mnemonic, operand...]. Filled by decode_mem and
    // emptied again at the end of run_instructions.
    decoded_instructions: {},


    decode_mem: function()
    {
        var pos = this.decode_pos;
        var b1 = this.mem[(this.decode_offset * 16) + this.decode_pos];
        var b2 = this.mem[(this.decode_offset * 16) + this.decode_pos + 1];
        var operand2 = b2;
        var mod = ((0x10 & b1) >> 4);
        var operand1 = (0x07 & b1);
        var instruction = ((0xE0 & b1) >> 5);
        if(instruction == 0x07){ //hlt
            this.decoded_instructions[pos] = ["htl"];
            return false;
        }
        if(mod == 0){
            switch(instruction){
            case 0x00: //"jmp"
                this.decode_pos += 1;
                this.decoded_instructions[pos] = ["jmp", "r" + String(operand1)];
                return true;
            case 0x01: //"movr"
                this.decode_pos += 2;
                this.decoded_instructions[pos] =
                    this.swap_out_segment_registers(["movr", "r" + String(operand1), "r" +
                                                     String(operand2)]);
                return true;
            case 0x02: //"movm"
                this.decode_pos += 2;  
                this.decoded_instructions[pos] =
                    this.swap_out_segment_registers(["movm", "r" + String(operand1), "ds:r" +
                                                     String(operand2)]);
                return true;
            case 0x03: //"add"
                this.decode_pos += 2;
                this.decoded_instructions[pos] =
                    this.swap_out_segment_registers(["add", "r" + String(operand1), "r" +
                                                     String(operand2)]);
                return true;
            case 0x04: //"xor"
                this.decode_pos += 2;
                this.decoded_instructions[pos] =
                    this.swap_out_segment_registers(["xor", "r" + String(operand1), "r" +
                                                     String(operand2)]);
                return true;
            case 0x05: //"cmp"
                this.decode_pos += 2;
                this.decoded_instructions[pos] =
                    this.swap_out_segment_registers(["cmp", "r" + String(operand1), "r" +
                                                     String(operand2)]);
                return true;
            case 0x06: //"jmpe"
                this.decode_pos += 1;
                this.decoded_instructions[pos] = ["jmpe", "r" + String(operand1)];
                return true;
            }
            } else { //mod == 1
            switch(instruction){
            case 0x00: //"jmp"
                this.decode_pos += 2;
                this.decode_offset = operand2 * 16;
                this.decoded_instructions[pos] = ["jmp", "s" + String(operand2) + ":r" +
                                                  String(operand1)];
                return false;
            case 0x01: //"movr"
                this.decode_pos += 2;
                this.decoded_instructions[pos] =
                    this.swap_out_segment_registers(["movr", "r" + String(operand1), operand2]);
                return true;
            case 0x02: //"movm"
                this.decode_pos += 2;
                this.decoded_instructions[pos] =
                    this.swap_out_segment_registers(["movm", "ds:r" + String(operand1), "r" +
                                                     String(operand2)]);
                return true;
            case 0x03: //"add"
                this.decode_pos += 2;
                this.decoded_instructions[pos] =
                    this.swap_out_segment_registers(["add", "r" + String(operand1), operand2]);
                return true;
            case 0x04: //"xor"
                this.decode_pos += 2;
                this.decoded_instructions[pos] =
                    this.swap_out_segment_registers(["xor", "r" + String(operand1), operand2]);
                return true;
            case 0x05: //"cmp"
                this.decode_pos += 2;
                this.decoded_instructions[pos] =
                    this.swap_out_segment_registers(["cmp", "r" + String(operand1), operand2]);
                return true;
            case 0x06: //"jmpe"
                this.decode_pos += 2;
                this.decode_offset = operand2 * 16;
                this.decoded_instructions[pos] = ["jmpe", "s" + String(operand2) + ":r" +
                                                  String(operand1)];
                return false;
            }
        }
    },
    run_instructions: function(){
        document.write("<table border=\"1\">");
        var looping = true;
        while(looping && this.cpu.ip < 256){
            if(!(this.cpu.ip in this.decoded_instructions)){
                this.cpu.ip++;
                continue;
            }
            var instruction = this.decoded_instructions[this.cpu.ip];
            switch(instruction[0]){
            case "jmp":
                if(instruction[1][0] == "s"){
                    var re = /s(\d*):(r\d)/;
                    var vals = instruction[1].match(re);
                    this.cpu.cs = Number(vals[1]);
                    this.cpu.ip = eval("this.cpu." + vals[2]) - 1;
                    looping = false;
                } else {
                    this.cpu.ip = eval("this.cpu." + instruction[1]) - 1;
                }
                break;
            case "jmpe":
                if(this.cpu.fl != 0){
                    break;
                }
                if(instruction[1][0] == "s"){
                    var re = /s(\d*):(r\d)/;
                    var vals = instruction[1].match(re);
                    this.cpu.cs = Number(vals[1]);
                    this.cpu.ip = eval("this.cpu." + vals[2]) - 1;
                    looping = false;
                } else {
                    this.cpu.ip = eval("this.cpu." + instruction[1]) - 1;
                }
                break;
            case "movr":
                if(instruction[2][0] == "r" || instruction[2][0] == "c" ||
                        instruction[2][0] == "d"){
                    eval("this.cpu." + instruction[1] + " = this.cpu." + instruction[2])
                } else {
                    eval("this.cpu." + instruction[1] + " = " + instruction[2]);
                }
                break;
            case "movm":
                var re = /ds:(r\d*)/;
                if(instruction[1][0] == "d"){ //mod 1
                    var vals = instruction[1].match(re);
                    var r = eval("this.cpu." + vals[1]);
                    this.mem[r + (this.cpu.ds * 16)] = eval("this.cpu." + instruction[2]);
                } else {
                    var vals = instruction[2].match(re);
                    var r = eval("this.cpu." + vals[1]);
                    eval("this.cpu." + instruction[1] + " = this.mem[r + " +
                         (this.cpu.ds * 16) + "]");
                }
                break;
            case "add":
                if(instruction[2][0] == "r" || instruction[2][0] == "c" ||
                        instruction[2][0] == "d"){
                    eval("this.cpu." + instruction[1] + " += this.cpu." + instruction[2]);
                } else {
                    eval("this.cpu." + instruction[1] + " += " + instruction[2]);
                }  
                break;
            case "xor":
                if(instruction[2][0] == "r"){
                    eval("this.cpu." + instruction[1] + " ^= this.cpu." + instruction[2]);
                } else {
                    eval("this.cpu." + instruction[1] + " ^= " + instruction[2]);
                }  
                break;
            case "cmp":
                if(instruction[2][0] == "r"){
                    if(eval("this.cpu." + instruction[1]) == eval("this.cpu." + instruction[2])){
                 this.cpu.fl = 0;
                    } else if(eval("this.cpu." + instruction[1]) > eval("this.cpu." +
                                                                        instruction[2])){
                 this.cpu.fl = 1;
                    } else{
                 this.cpu.fl = 0xFF;
                    }
                } else {
                    if(eval("this.cpu." + instruction[1]) == instruction[2]){
                 this.cpu.fl = 0;
                    } else if(eval("this.cpu." + instruction[1]) > instruction[2]){
                 this.cpu.fl = 1;
                    } else{
                 this.cpu.fl = 0xFF;
                    }
                }
                break;
            case "hlt":
                looping = false;
                break;         
            }
            this.cpu.ip++;
            document.write("<tr>");
            document.write("<td>" + this.get_instruction_text(instruction));
            document.write("</td><td>" + this.get_state() + "</td></tr>");
        }
        document.write("</table></p>");
        this.decoded_instructions = {}
        this.decode_offset = this.cpu.cs;
        this.decode_pos = 0;
    },
    swap_out_segment_registers: function(instruction){
        for(var i = 0; i < instruction.length;i++){
            if(instruction[i] == "r4"){instruction[i] = "cs";}
            if(instruction[i] == "r5"){instruction[i] = "ds";}
        }
        return instruction;
   },
    print_mem: function(){
        document.write("Memory Dump<br />");
        for(var i = 0; i < this.mem.length; i++){
            if(this.mem[i] < 0x20 || this.mem[i] > 0x7E){
                var v = this.mem[i].toString(16);
                if(v.length == 1){
                    v = "0x0" + v;
                } else {
                    v = "0x" + v;
                }
                document.write(v + ", ");
            } else {
                document.write("'" + String.fromCharCode(this.mem[i]) + "' , ");
            }
            if(((i + 1) % 16) == 0){
                document.write("<br />");
            }
            if(((i + 1) % 256) == 0){
                document.write("<br />");
            }
        }
    },
    get_state: function(){
        var state = "ip = " + this.cpu.ip + ", ";
        state += "cs = " + this.cpu.cs + ", ";
        state += "ds = " + this.cpu.ds + ", ";
        state += "r0 = " + this.cpu.r0 + ", ";
        state += "r1 = " + this.cpu.r1 + ", ";
        state += "r2 = " + this.cpu.r2 + ", ";
        state += "r3 = " + this.cpu.r3 + ", ";
        state += "fl = " + this.cpu.fl + ".";
        return state;
    },
    get_instruction_text: function(instruction){
        var text = instruction[0];
        if(instruction.length == 2){
            text += " " + instruction[1];
        }
        if(instruction.length == 3){
            text += " " + instruction[1] + ", " + String(instruction[2]);
        }
        return text;
    },
    print_instructions: function(){
        document.write("<p>");
        document.write("Decoded instructions:<br />");
        document.write("<table border=\"1\">");
   
        for(var i = 0; i < 256 ; i++){
            if(!(i in this.decoded_instructions)){
                continue;
            }
            document.write("<tr>");
            var instruction = this.decoded_instructions[i];
            document.write("<td>At pos " + i + "</td>");
            document.write("<td>" + this.get_instruction_text(instruction));
            document.write("</td><td>");
        }
        document.write("</table></p>");
    },
    // Decoder cursor: position, relative to the start of the segment being
    // decoded, of the next instruction that decode_mem will read.
    decode_pos : 0x00,
    // Segment being decoded; decode_mem multiplies it by 16 to get the base
    // address in mem.
    decode_offset: 0x00,
    decode: function(){
        while(this.decode_mem()){ }
        this.print_instructions()
        this.run_instructions()
        this.print_mem()
        while(this.decode_mem()){ }
        this.print_instructions()
        this.run_instructions()
        this.print_mem()
/*
        this.decode_offset = 22;
        this.decode_pos = 0;
        while(this.decode_mem()){ }
        this.print_instructions()
        this.print_mem()
*/

    },
    // Instruction mnemonics in opcode order (index 0x00-0x07), matching the
    // instruction-set table in the comments of exec().
    instructions: ["jmp", "movr","movm", "add","xor","cmp","jmpe", "hlt"],
    exec: function()
    {
 // Placeholder entry point: the body contains no statements (the original
 // `throw` at the bottom is commented out), so calling it does nothing.
 // The specification of the virtual machine is kept below as reference.
 //
 // virtual machine architecture
 // ++++++++++++++++++++++++++++
 //
 // segmented memory model with 16-byte segment size (notation seg:offset)
 //
 // 4 general-purpose registers (r0-r3)
 // 2 segment registers (cs, ds equiv. to r4, r5)
 // 1 flags register (fl)
 //
 // instruction encoding
 // ++++++++++++++++++++
 //
 //           byte 1               byte 2 (optional)
 // bits      [ 7 6 5 4 3 2 1 0 ]  [ 7 6 5 4 3 2 1 0 ]
 // opcode      - - -            
 // mod               -          
 // operand1            - - - -
 // operand2                         - - - - - - - -
 //
 // operand1 is always a register index
 // operand2 is optional, depending upon the instruction set specified below
 // the value of mod alters the meaning of any operand2
 //   0: operand2 = reg ix
 //   1: operand2 = fixed immediate value or target segment (depending on instruction)
 //
 // instruction set
 // +++++++++++++++
 //
 // Notes:
 //   * r1, r2 => operand 1 is register 1, operand 2 is register 2
 //   * movr r1, r2 => move contents of register r2 into register r1
 //
 // opcode | instruction | operands (mod 0) | operands (mod 1)
 // -------+-------------+------------------+-----------------
 // 0x00   | jmp         | r1               | r2:r1
 // 0x01   | movr        | r1, r2           | rx,   imm
 // 0x02   | movm        | r1, [ds:r2]      | [ds:r1], r2
 // 0x03   | add         | r1, r2           | r1,   imm
 // 0x04   | xor         | r1, r2           | r1,   imm
 // 0x05   | cmp         | r1, r2           | r1,   imm
 // 0x06   | jmpe        | r1               | r2:r1
 // 0x07   | hlt         | N/A              | N/A
 //
 // flags
 // +++++
 //
 // cmp r1, r2 instruction results in:
 //   r1 == r2 => fl = 0
 //   r1 < r2  => fl = 0xff
 //   r1 > r2  => fl = 1
 //
 // jmpe r1
 //   => if (fl == 0) jmp r1
 //      else nop
 //    throw "VM.exec not yet implemented";
    }
 
};


//--------------------------------------------------------------------------------------------------


// Run the two-pass decode/execute driver and surface any thrown error to the
// user in a dialog. (VM.exec() is an empty placeholder and is not called.)
try {
    VM.decode();
} catch (err) {
    alert('\nError: ' + err + '\n');
}

//--------------------------------------------------------------------------------------------------
 

Save the code above as stage2.js, then load the following HTML file in Chrome or Firefox to see it work:

<html>
    <head>
        <script type="text/javascript" src="stage2.js">
        </script>
    </head>
    <body>
    </body>
</html>


Now onto stage 3...