luau/tools/codesizeprediction.py

186 lines
6.3 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/python3
# This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
# NOTE: This script is experimental. This script uses a linear regression to construct a model for predicting native
# code size from bytecode. Some initial work has been done to analyze a large corpus of Luau scripts, and while for
# most functions the model predicts the native code size quite well (+/-25%), there are many cases where the predicted
# size is off by as much as 13x. Notably, the predicted size is generally better for smaller functions and worse for
# larger functions. Therefore, in its current form this analysis is probably not suitable for use as a basis for
# compilation heuristics. A nonlinear model may produce better results. The script here exists as a foundation for
# further exploration.
import json
import glob
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import argparse
def readStats(statsFileGlob):
    '''Read every stats file matching the supplied glob into a DataFrame.

    Files should be JSON generated by the Compile.cpp CLI; one row is
    produced per compiled function across all matched files.'''
    paths = glob.glob(statsFileGlob, recursive=True)
    print("Reading %s files." % len(paths))
    # Build column-wise so the DataFrame keeps its columns (in this order)
    # even when the glob matches nothing.
    columns = {key: [] for key in (
        "statsFile", "script", "name", "line",
        "bcodeCount", "irCount", "asmCount", "bytecodeSummary")}
    for path in paths:
        parsed = json.loads(Path(path).read_text())
        for script, filestats in parsed.items():
            for fn in filestats["lowerStats"]["functions"]:
                columns["statsFile"].append(path)
                columns["script"].append(script)
                columns["name"].append(fn["name"])
                columns["line"].append(fn["line"])
                columns["bcodeCount"].append(fn["bcodeCount"])
                columns["irCount"].append(fn["irCount"])
                columns["asmCount"].append(fn["asmCount"])
                # first summary entry is the opcode distribution; a tuple keeps
                # it hashable for the groupby/drop_duplicates done later
                columns["bytecodeSummary"].append(
                    tuple(fn["bytecodeSummary"][0]))
    return pd.DataFrame.from_dict(columns)
def addFunctionCount(df):
    """Annotate every row with 'functionCount'.

    functionCount is the number of distinct (asmCount, bytecodeSummary)
    functions that share the row's bytecodeSummary — i.e. how many unique
    functions collapse onto the same opcode distribution."""
    # de-duplicate first so identical functions are counted once
    unique = df.drop_duplicates(
        subset=['asmCount', 'bytecodeSummary'], ignore_index=True)
    counts = (unique
              .groupby(['bytecodeSummary'])
              .size()
              .reset_index(name='functionCount'))
    return df.merge(counts, on='bytecodeSummary', how='left')
# def deduplicateDf(df):
# return df.drop_duplicates(subset=['bcodeCount', 'asmCount', 'bytecodeSummary'], ignore_index=True)
def randomizeDf(df, random_state=None):
    """Return df with its rows shuffled (a new DataFrame; df is untouched).

    random_state: optional seed forwarded to DataFrame.sample so the shuffle
    (and therefore the train/validation split made downstream) can be
    reproduced; None (the default) keeps the original nondeterministic
    behavior.
    """
    return df.sample(frac=1, random_state=random_state)
def splitSeq(seq):
    """Split seq into (first half, second half).

    For odd lengths the extra element goes to the second half."""
    mid = len(seq) // 2
    head, tail = seq[:mid], seq[mid:]
    return (head, tail)
def trainAsmSizePredictor(df):
    """Fit a linear model predicting asmCount from the bytecode opcode
    distribution and print its training/validation error.

    The first half of df trains the model and the second half validates it,
    so df should already be shuffled (see randomizeDf). Mutates df in place,
    adding columns: asmCountPredicted, usedForTraining, diff, diffPerc.
    Returns (fitted LinearRegression, df).
    """
    XTrain, XValidate = splitSeq(
        np.array([list(seq) for seq in df.bytecodeSummary]))
    YTrain, YValidate = splitSeq(np.array(df.asmCount))
    # Non-negative coefficients (each opcode can only add native
    # instructions) and no intercept (zero bytecode => zero asm).
    reg = LinearRegression(
        positive=True, fit_intercept=False).fit(XTrain, YTrain)
    YPredict1 = reg.predict(XTrain)
    YPredict2 = reg.predict(XValidate)
    trainRmse = np.sqrt(np.mean((np.array(YPredict1) - np.array(YTrain))**2))
    predictRmse = np.sqrt(
        np.mean((np.array(YPredict2) - np.array(YValidate))**2))
    print(f"Score: {reg.score(XTrain, YTrain)}")
    print(f"Training RMSE: {trainRmse}")
    print(f"Prediction RMSE: {predictRmse}")
    print(f"Model Intercept: {reg.intercept_}")
    print(f"Model Coefficients:\n{reg.coef_}")
    df.loc[:, 'asmCountPredicted'] = np.concatenate(
        (YPredict1, YPredict2)).round().astype(int)
    df['usedForTraining'] = np.concatenate(
        (np.repeat(True, YPredict1.size), np.repeat(False, YPredict2.size)))
    df['diff'] = df['asmCountPredicted'] - df['asmCount']
    df['diffPerc'] = (100 * df['diff']) / df['asmCount']
    # asmCount == 0 makes diffPerc +inf (diff != 0) or NaN (diff == 0).
    # The original code zeroed only +inf, letting NaN survive into the
    # rounded column and silently slip past later min/max filtering;
    # zero every non-finite value instead.
    df.loc[~np.isfinite(df["diffPerc"]), 'diffPerc'] = 0.0
    df['diffPerc'] = df['diffPerc'].round()
    return (reg, df)
def saveModel(reg, file):
    """Write the fitted model's intercept and coefficients to a text file.

    reg: a fitted sklearn LinearRegression (anything exposing intercept_
    and coef_ works).
    file: destination path; overwritten if it already exists.
    """
    # context manager closes the handle even if a write raises
    # (the original manual open/close leaked it on error)
    with open(file, "w") as f:
        f.write(f"Intercept: {reg.intercept_}\n")
        f.write(f"Coefficients: \n{reg.coef_}\n")
def bcodeVsAsmPlot(df, plotFile=None, minBcodeCount=None, maxBcodeCount=None):
    """Scatter-plot native (asm) instruction count against bytecode count.

    Rows whose bcodeCount falls outside [minBcodeCount, maxBcodeCount] are
    excluded; either bound defaults to the observed min/max. If plotFile is
    given the figure is also saved there. Returns the matplotlib.pyplot
    module so the caller can .show() it.
    """
    lower = df.bcodeCount.min() if minBcodeCount is None else minBcodeCount
    upper = df.bcodeCount.max() if maxBcodeCount is None else maxBcodeCount
    inRange = df[(df.bcodeCount >= lower) & (df.bcodeCount <= upper)]
    plt.scatter(inRange.bcodeCount, inRange.asmCount)
    plt.title("ASM variation by Bytecode")
    plt.xlabel("Bytecode Instruction Count")
    plt.ylabel("ASM Instruction Count")
    if plotFile is not None:
        plt.savefig(plotFile)
    return plt
def predictionErrorPlot(df, plotFile=None, minPerc=None, maxPerc=None, bins=200):
    """Histogram of prediction-error percentages on the validation rows.

    Only rows not used for training are plotted; diffPerc values outside
    [minPerc, maxPerc] are excluded (each bound defaults to the observed
    min/max). If plotFile is given the figure is also saved there. Returns
    the matplotlib.pyplot module so the caller can .show() it.
    """
    lower = df['diffPerc'].min() if minPerc is None else minPerc
    upper = df['diffPerc'].max() if maxPerc is None else maxPerc
    keep = ((df["usedForTraining"] == False)
            & (df["diffPerc"] >= lower)
            & (df["diffPerc"] <= upper))
    plt.hist(df[keep]["diffPerc"], bins=bins)
    plt.title("Prediction Error Distribution")
    plt.xlabel("Prediction Error %")
    plt.ylabel("Function Count")
    if plotFile is not None:
        plt.savefig(plotFile)
    return plt
def parseArgs():
    """Build the command-line parser and return the parsed arguments."""
    parser = argparse.ArgumentParser(
        prog='codesizeprediction.py',
        description='Constructs a linear regression model to predict native instruction count from bytecode opcode distribution')
    # required positional inputs
    for name, helpText in (
            ("fileglob",
             "glob pattern for stats files to be used for training"),
            ("modelfile",
             "text file to save model details")):
        parser.add_argument(name, help=helpText)
    # optional figure-output paths
    for name, helpText in (
            ("--nativesizefig",
             "path for saving the plot showing the variation of native code size with bytecode"),
            ("--predictionerrorfig",
             "path for saving the plot showing the distribution of prediction error")):
        parser.add_argument(name, help=helpText)
    return parser.parse_args()
if __name__ == "__main__":
    args = parseArgs()
    # Load one row per compiled function, annotate duplicate counts, and
    # shuffle so the in-order train/validation split is unbiased.
    statsDf = addFunctionCount(readStats(args.fileglob))
    shuffled = randomizeDf(statsDf)
    plt = bcodeVsAsmPlot(shuffled, args.nativesizefig, 0, 100)
    plt.show()
    (reg, predicted) = trainAsmSizePredictor(shuffled)
    saveModel(reg, args.modelfile)
    plt = predictionErrorPlot(predicted, args.predictionerrorfig, -200, 200)
    plt.show()