# Crappy script to turn the output of a Radare2 disassembly listing
# into a HTML document (which still looks like plain text),
# with links from caller (e.g. after 'bl') to callee ("(fcn)").
#
# By Wolf, DL4YHF.
# Latest modifications:
#    2017-03-04 : Added links from functions to variables (and back)
#


import sys

def sym2link(symbol):
    """Return an HTML hyperlink that jumps to the in-page anchor named *symbol*."""
    pieces = ('<a href="#', symbol, '">', symbol, '</a>')
    return ''.join(pieces)

def sym2noLink(symbol):
    """Return *symbol* rendered as a red, underlined 'broken link' (no anchor exists for it)."""
    opening = '<font color="red"><u>'
    closing = '</u></font>'
    return ''.join((opening, symbol, closing))

def sym2anchor(symbol):
    """Return a bold HTML anchor whose id is *symbol* (the target for sym2link)."""
    pieces = ('<a id="', symbol, '"><b>', symbol, '</b></a>')
    return ''.join(pieces)

def parseHex(sAddr):
    """Convert the hexadecimal string *sAddr* into an int.

    A leading "0x" is accepted. Returns 0 when the string is not valid hex.
    """
    try:
        return int(sAddr, 16)
    except ValueError:  # raised by int() when the text is NOT hexadecimal
        return 0

# Base addresses of STM32F40x peripherals (SFRs) that are treated "like RAM"
# so that references to them show up in the variable list. The peripheral
# names are the original author's annotations; only exact base addresses
# match, not the registers behind them.
_SFR_BASE_ADDRESSES = frozenset([
    0x40000000, 0x40000400, 0x40000800, 0x40000C00,  # Timer2 .. Timer5
    0x40001000, 0x40001400, 0x40001800, 0x40001C00,  # Timer6, 7, 12, 13
    0x40002000, 0x40002800, 0x40002C00, 0x40003000,  # Timer14, RTC, WWDG, IWDG
    0x40003400, 0x40003800, 0x40003C00, 0x40004000,  # I2S2ext, SPI2/I2S2, SPI3/I2S3, I2S3ext
    0x40004400, 0x40004800, 0x40004C00, 0x40005000,  # UART2, UART3, UART4, UART5
    0x40005400, 0x40005800, 0x40005C00, 0x40007400,  # I2C1, I2C2, I2C3, DAC
    0x40007800, 0x40007C00,                          # UART7, UART8
    # TIM1 was listed as 0x40010C00, which is a reserved range on the
    # STM32F4; the TIM1 register block starts at 0x40010000.
    0x40010000, 0x40010400,                          # TIM1, TIM8
    0x40011000, 0x40011400, 0x40012000, 0x40013000,  # UART1, UART6, ADC1, SPI1
    0x40012100, 0x40012200,                          # ADC2, ADC3
    0x40013400, 0x40013800, 0x40013C00, 0x40014000,  # SPI4, SYSCFG, EXTI, TIM9
    # The entry after SPI5 was 0x40025400 (labelled "SPI5" a second time);
    # SPI6 is located at 0x40015400.
    0x40014400, 0x40014800, 0x40015000, 0x40015400,  # TIM10, TIM11, SPI5, SPI6
    0x40020000, 0x40020400, 0x40020800, 0x40020C00,  # GPIOA, B, C, D
    0x40021000, 0x40021400, 0x40021800, 0x40021C00,  # GPIOE, F, G, H
    0x40022000, 0x40022400, 0x40022800, 0x40023000,  # GPIOI, J, K, CRC
])


def isRAM(addr):
    """Tell whether *addr* shall be listed as a variable.

    Returns True for any address inside 0x20000000..0x2001FFFF ('SRAM1' +
    'SRAM2' in STM32F40x) and for the peripheral base addresses in
    _SFR_BASE_ADDRESSES (2017-03-04: some SFRs are treated like RAM to have
    them listed). Returns False for everything else.
    """
    return 0x20000000 <= addr <= 0x2001FFFF or addr in _SFR_BASE_ADDRESSES


def getNameOrAddressOfSomethingInRAM(line):
    """Extract the RAM/SFR symbol referenced by an 'ldr' disassembly line.

    The 2nd operand of 'ldr' / 'ldr.w' is often an address in the
    function's literal pool. Radare2 shows the VALUE of that literal in the
    comment on the same line, e.g.:
        ldr r1, [0x0804fc06] ; [0x804fc04:4]=0x2001e5f0 radio_status_1
    The comment fragment is expected at the fixed column 83 and must begin
    with ":4]=0x200", ":4]=0x400" or "lit=0x200".

    Returns the last word of the line (the symbol name, or the hex address
    itself when no name follows - that's intended) if the address passes
    isRAM(). Returns "" when the line is not "something in RAM".
    """
    if line.find(" ldr", 40) < 40:  # no ldr / ldr.w in the mnemonic column
        return ""
    token = line[83:]
    if token[:9] not in (":4]=0x200", ":4]=0x400", "lit=0x200"):
        return ""  # comment does not look like a RAM- or SFR- address
    # Skip the 4-character prefix (":4]=" or "lit="), leaving e.g.
    # "0x2001e5f0 radio_status_1" or just "0x2001e5f0".
    # split() without a separator ignores trailing whitespace and repeated
    # blanks; split(' ') yielded an empty last word for "0x2001e5f0 " and
    # so lost the reference. The list is never empty here because the text
    # is known to start with "0x".
    words = token[4:].split()
    if isRAM(parseHex(words[0])):  # valid address, guess it's a variable
        return words[-1]
    return ""

class DisasmToHTML:
    """Turn a Radare2 disassembly listing into an HTML file with internal links.

    processFile() reads '<name>.txt', collects function and variable
    references in a first pass, then writes '<name>.htm': an index of the
    functions, the listing itself (decorated with anchors and links) and an
    appendix of RAM/SFR locations that have no symbol yet.
    """

    def __init__(self):
        # All containers are created per instance. As class attributes they
        # would be shared (and mixed up) between all DisasmToHTML objects.
        self.in_lines = []         # all lines loaded from the input file
        self.functions = {}        # name -> set of callers, for all functions
                                   # CALLED FROM or SHOWN IN the listing
        self.funcs_listed = set()  # all functions really SHOWN in the listing
        self.variables = {}        # name -> set of functions using it; anything
                                   # in RAM referenced from listed functions,
                                   # using hex address as name if not in symbol table
        self.vars_listed = set()   # all variables (RAM addr) in the symbol table
        self.current_function = ""
        self.of = None             # output file, open only during processFile()

    def writeHTMLPart1(self):
        """Write the HTML head (doctype, stylesheet) and open <body> on self.of."""
        self.of.write("<!doctype html>\n<html>\n<head>\n<style>\n" )

        # Styles for a multi-column index (function overview or similar) :
        self.of.write("ul#MultiColumn{\n")
        self.of.write(" -moz-column-count: 3;\n")  # Will web browsers..
        self.of.write(" -moz-column-gap: 10px;\n") # ..ever speak a common language ?
        self.of.write(" -webkit-column-count: 3;\n")
        self.of.write(" -webkit-column-gap: 10px;\n")
        self.of.write(" column-count: 3;\n")
        self.of.write(" column-gap: 10px;\n")
        self.of.write(" }\n")
        self.of.write("ul#MultiColumn li{\n")
        self.of.write("  line-height: 1.5em;\n")
        self.of.write("  display: block;\n")
        self.of.write(" }\n")
        self.of.write("ul#MultiColumn .double li {\n")
        self.of.write("   width: 50%;\n")
        self.of.write(" }\n")
        self.of.write("</style>\n</head>\n<body>\n")

    def addFunctionAndCaller(self, sym_name, current_function):
        """Register function *sym_name*; record *current_function* as a caller unless it is ''.

        An empty *sym_name* is ignored. Callers are kept in a set, so the
        same caller never appears twice.
        """
        if not sym_name:
            return
        callers = self.functions.setdefault(sym_name, set())
        if current_function != '':
            callers.add(current_function)

    def addVariableReference(self, var_name, current_function):
        """Register variable *var_name*; record *current_function* as a user unless it is ''.

        An empty *var_name* is ignored.
        """
        if not var_name:
            return
        users = self.variables.setdefault(var_name, set())
        if current_function != '':
            users.add(current_function)

    def _symbolLink(self, sym_name):
        """Return a clickable link for a listed function, else a red 'broken' link."""
        if sym_name in self.funcs_listed:
            return sym2link(sym_name)
        return sym2noLink(sym_name)

    def _appendLinks(self, out_line, names, continuation):
        """Append a space-separated list of links for *names* to *out_line*.

        When the line grows beyond 220 characters it is flushed to self.of
        and a new line is begun with *continuation*. The (possibly new)
        unfinished line is returned. Names are emitted in sorted order so
        that the output is the same on every run; plain set iteration order
        depends on string hashing.
        """
        for name in sorted(names):
            if len(out_line) > 220:
                self.of.write(out_line + "\n")
                out_line = continuation
            out_line = out_line + ' ' + self._symbolLink(name)
        return out_line

    def appendListOfVarRefs(self, out_line, var_name ):
        """Append ' used by: ' plus links to all known users of *var_name*.

        Returns *out_line* unchanged when the variable has no known users.
        Full lines may be written to self.of on the way (see _appendLinks).
        Raises KeyError if *var_name* is not a key of self.variables.
        """
        var_users = self.variables[var_name]
        if var_users: # Are there any known 'users' of this variable ?
            # The purpose of this listing is an easy-to-use overview;
            #  the rest will be done in a Radare2- or ARM-debugger session.
            out_line = self._appendLinks(out_line + ' used by: ', var_users,
                                         '              |__ ')
        return out_line

    def processFile(self, filename_without_extension):
        """Convert '<filename_without_extension>.txt' into '<filename_without_extension>.htm'."""
        self.current_function = ""

        # Read the raw disassembler listing into memory
        with open(filename_without_extension+'.txt') as infile:
            self.in_lines = infile.read().splitlines(False)

        # Pass 1 : Find function names etc, and store them in dictionaries
        self._collectSymbols()

        # Create the output file: HTML-ish, but 'almost' plain text only.
        #   No fancy templating, keep simple things simple.
        self.of = open(filename_without_extension+'.htm', 'w')
        try:
            self.writeHTMLPart1()
            self._writeFunctionIndex()

            # Pass 2 : Append the listing itself, with internal links and anchors:
            self.of.write("<pre><code>")
            for line in self.in_lines:
                self.of.write(self._decorateLine(line) + "\n")

            self._writeUnlistedVariables()

            # End of the disassembly listing, finish the HTML file:
            self.of.write("</code></pre>\n</body>\n</html>")
        finally:
            self.of.close()  # also when one of the passes raised an exception

    def _collectSymbols(self):
        """Pass 1: fill functions, funcs_listed, variables and vars_listed from in_lines."""
        # Must exist before the first line is examined: a 'bl' or 'ldr'
        # line may precede the first "/ (fcn)" line.
        current_function = ""
        for line in self.in_lines:
            # look for special sequences in the disassembly created by Radare2:
            if line.startswith("/ (fcn) ") : # in an annotated function now
                # e.g. Radare2 output: "/ (fcn) Reset_Handler 8"
                sym_name = line.rsplit(' ',2)[1]
                self.addFunctionAndCaller(sym_name, "")
                self.funcs_listed.add(sym_name) # mark as LISTED function,
                   # so it will be shown in the index, and displayed as a
                   # 'clickable' link (not a red 'broken' link) later.
                current_function = sym_name # here: for annotated FUNCTION
            elif line[12:14]=="0x" : # Looks like Radare2's disassembly
                # Not sure if Radare2 always aligns mnemonics to column 44
                # so look for subroutine calls beginning in column 40:
                column = line.find(" bl ",40)
                if column >= 40 : # should be followed by callee's name or address
                    # Because many callees are not annotated yet, treat their
                    # hexadecimal address like a name, so we can link to them .
                    # e.g. 'RCC_Init' called from 'SystemInit', but also
                    #      '0x804e102' called from '0x0804e2f4'.
                    # Beware: Radare2 omits leading zeroes in OPERANDS,
                    #         but not in the ADDRESS column !
                    words = line[column:].split()
                    # words[0] is the mnemonic (branch with link),
                    # words[1] should be the operand (=callee); it is
                    # missing when nothing but blanks follows the mnemonic.
                    if len(words) > 1:
                        self.addFunctionAndCaller(words[1], current_function)

                # Extract a list ("dictionary") of references to RAM:
                sym_name = getNameOrAddressOfSomethingInRAM(line)
                if sym_name != "":
                   # sym_name may be the hex address itself ! That's intended.
                   self.addVariableReference( sym_name, current_function)
            elif line[12:16] == ";-- ":  # another annotation in Radare2 ?
                # This is not the same as an annotated FUNCTION (R2: "af+")
                # but it may be the target for a branch, call, or IRQ handler,
                #     e.g.: "|           ;-- TIM7_DAC_IRQHandler:"
                # To keep it simple, treat the symbol like a function:
                sym_name = line[16:-1] # exclude the trailing colon !
                self.addFunctionAndCaller(sym_name, current_function)
                self.funcs_listed.add(sym_name) # mark as LISTED function, too
                current_function = sym_name  # here: for 'f'lagged label (?)
            elif line[12:14]==" /" : # Looks like an 'annotated hexdump' (pxa)
                sym_name = line[14:].strip() # used to show font headers, etc
                self.addFunctionAndCaller(sym_name, "")
                self.funcs_listed.add(sym_name) # no executable code, anyway..
                # a hex-dump is usually a table, constant, but not a function
                # so keep current_function unchanged here.

            if line.startswith("\\  ") : # not in an annotated FUNCTION anymore
                current_function = ""    # prevent wrong 'caller' entries

            # Symbol table line with an address in RAM ("0x200") or of an SFR
            # ("0x400", treated "like RAM" for a start): the last word is
            # stored as the symbol's name.
            if line.startswith(("0x200", "0x400")):
                self.vars_listed.add(line.split()[-1])

    def _writeFunctionIndex(self):
        """Write the index of listed functions, then the functions called but not listed."""
        self.of.write("<h1>Disassembled Function Overview</h1>\n")
        self.of.write("<a id='Functions'></a><ul id='MultiColumn'>\n")
        called_but_not_listed = []
        for sym_name in sorted(self.functions):
            if sym_name in self.funcs_listed: # not just CALLED but SHOWN in the listing:
                self.of.write('<li> <a href="#'+sym_name+'">'+sym_name+'</a><br>\n')
            else:
                called_but_not_listed.append(sym_name)
        self.of.write("</ul>\n")

        if not called_but_not_listed:
            return
        self.of.write("<h2>Functions called further below but not shown in the disassembly listing yet</h2>\n")
        self.of.write("&nbsp; To have them listed, extend the Radare2 script with 'pdf @FuncName'.<br>\n")
        self.of.write("&nbsp; Please report your findings about a function's purpose, and suggest<br>\n")
        self.of.write("&nbsp; a proper name for it at the MD380Tools group.<br>\n")
        self.of.write("<a id='Functions'></a><ul id='MultiColumn'>\n")
        for sym_name in called_but_not_listed:
            # show how many times it is referenced (e.g. "called")
            # to decide which functions should be annotated
            # (in the .r file) and added to the listing:
            nrefs = len(self.functions[sym_name])
            s = ' ('+str(nrefs)+' callers)' if nrefs > 1 else ''
            self.of.write('<li> <font color="red"><u>'+sym_name+'</u></font>'+s+'\n')
        self.of.write("</ul>\n")

    def _decorateLine(self, line):
        """Pass 2: return *line* with anchors and links added.

        For a "/ (fcn)" line with known callers, the decorated line itself
        is written to self.of and the returned text is the 'Caller:' line.
        """
        out_line = line
        r_parts = line.rsplit(' ',1)
        if line.startswith("/ (fcn) "):  # e.g. "/ (fcn) Reset_Handler 8"
            sym_name = line.rsplit(' ',2)[1]  # the function name (last but one)
            if sym_name in self.functions : # known function (never an empty name)
                out_line = line.replace(sym_name, sym2anchor(sym_name) )
                callers = self.functions[sym_name]
                if callers: # Are there any KNOWN callers for this function ?
                    self.of.write(out_line + "\n") # emit original line with 'fcn'
                    out_line = self._appendLinks('|       Caller:', callers,
                                                 '|              ')
        elif r_parts[-1] in self.functions:
            # The LAST word in the line may be the name of a function.
            # Radare2 adds the name of any annotated symbol in the last
            #         column, depending on the operand, e.g.:
            # > ldr r0, [0x080f924c] ; [0x80f924c:4]=0x8094359 SystemInit
            # Turn it into a 'clickable' or 'non-functional' link.
            # A line without any blank consists of the name only; then
            # there is no text in front of the link.
            leading = r_parts[0] + ' ' if len(r_parts) > 1 else ''
            out_line = leading + self._symbolLink(r_parts[-1])
        elif line[12:14]=="0x" : # Looks like Radare2's disassembly,
            # with lots of space before the hexadecimal code address.
            # Not sure if Radare2 always aligns mnemonics to column 42..44
            # so look for subroutine calls beginning in column 40:
            token  = " bl " # 'branch with link' (subroutine call)
            column = line.find(token,40)
            if column >= 40 : # token at the right place for a mnemonic..
                # e.g. split(token) = ['', 'RCC_Init   ; blah, blah' ];
                # the second split skips optional comments and is empty
                # when only blanks follow the mnemonic.
                operands = line[column:].split(token)[1].split()
                if operands and operands[0] in self.functions:
                    # if Callee is a disassembled function link to it,
                    # else show a broken link similar as above:
                    out_line = line.replace(operands[0], self._symbolLink(operands[0]) )

            # Decorate references to RAM with links into the symbol table:
            sym_name = getNameOrAddressOfSomethingInRAM(line)
            if sym_name in self.variables:
               out_line = line.replace(sym_name, sym2link(sym_name) ) # sometimes 'links' fragments, anyway..
        elif line[12:16] == ";-- ": # another annotation in Radare2 ?
            sym_name = line[16:-1]  # exclude the trailing colon !
            if sym_name in self.functions:
                out_line = line.replace(sym_name, sym2anchor(sym_name))
        elif line[12:14]==" /" : # Looks like an 'annotated hexdump' (pxa)
            sym_name = line[14:].strip() # used to show font headers, etc
            if sym_name in self.functions:
                out_line = line.replace(sym_name, sym2anchor(sym_name))
        elif line.startswith("0x200") or line.startswith("0x400") :  # RAM or SFR ?
            sym_name = line.split()[-1]
            if sym_name in self.variables:
                out_line = line.replace(sym_name, sym2anchor(sym_name))
                # While we're at it, also show functions that USE
                #  this variable or special function register :
                out_line = self.appendListOfVarRefs(out_line, sym_name)

        # Because Radare2 (mis)interprets a lot of characters in the
        # comment string after 'CCa', replace the following tokens:
        # _GT_  ->  &gt; in HTML (greater than, because R2 misinterpreted '>')
        # _LT_  ->  &lt; in HTML (less than, because R2 misinterpreted '<')
        out_line = out_line.replace(' _GT_ ', ' &gt; ' )
        out_line = out_line.replace(' _LT_ ', ' &lt; ' )
        return out_line

    def _writeUnlistedVariables(self):
        """Write the appendix of referenced RAM/SFR locations that are not in the symbol table."""
        not_annotated = [name for name in sorted(self.variables)
                         if name not in self.vars_listed]
        if not not_annotated:
            return
        self.of.write("<h2>Variables and RAM/SFR locations not annotated yet</h2>")
        self.of.write(" (to annotate them, extend the Radare2 script with 'f symbol @ address')<br>\n")
        for sym_name in not_annotated:
            # Also show WHO references (uses) it, to help discovering the purpose
            out_line = '\n' + sym2anchor(sym_name)
            out_line = self.appendListOfVarRefs( out_line, sym_name)
            self.of.write( out_line )
        self.of.write("\n")  # no list was opened here, so there is no </ul> to emit

# end class DisasmToHTML

if __name__ == "__main__":
    # sys.argv[0] is the program name; only what follows are real arguments.
    cli_args = sys.argv[1:]
    converter = DisasmToHTML()
    if len(cli_args) > 1:
        print("Usage: disasm2htm <filename_without_extension> ")
    else:
        # One argument: name of the raw disassembly without extension.
        # No argument: use the default, produced by disasm_yhf.r
        converter.processFile(cli_args[0] if cli_args else 'listing')