Module convert

import glob, json, os, argparse
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer

if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Params')
    parser.add_argument('--segment_len', type=int, default=254,
                        help='maximum length (in tokens) of each example')
    # 254 rather than 256 so each example still fits in 256 tokens once it is wrapped as: <control_code> input_ids <eos>
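    # i.e. 1 (<control_code>) + up to 254 code tokens + 1 (<eos>) = 256 tokens per example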
    parser.add_argument('--stride', type=int, default=10,
                        help='stride (in tokens) between the starts of consecutive segments')
    parser.add_argument('--dev_size', type=float, default=0.1,
                        help="fraction of each language's segments held out as the dev set")
    args = parser.parse_args()

    gpt2_tok = GPT2Tokenizer.from_pretrained("gpt2", do_lower_case=False)
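    # GPT-2's byte-level BPE tokenizer never lowercases, so code identifiers keep their casing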
    paths = ['Python', 'Java']
    segments = {}

    # Walk every file in the Python and Java directories and segment each file's token stream with a sliding window
    for path in paths:
        source_files = glob.glob(f'{path}/**/*.py' if path == "Python" else f'{path}/**/*.java', recursive=True)
        for each_src in tqdm(source_files):
            with open(each_src, "r", encoding="utf-8") as f:
                code_content = f.read()
                encoded = gpt2_tok.encode(code_content)
                # Slide a window of segment_len tokens over the file, advancing by stride
                # tokens each step, so consecutive segments overlap heavily; windows near
                # the end of the file may be shorter than segment_len.
                for i in range(len(encoded) // args.stride):
                    seg = encoded[i * args.stride:i * args.stride + args.segment_len]
                    segments.setdefault(path, []).append(json.dumps({"token_ids": seg, "label": path}))

    # Split each language's segments into a training set and a dev set
    train, dev = [], []
    for key in segments:
        # shuffle=False: consecutive segments overlap (stride << segment_len), so a
        # contiguous split keeps train and dev from sharing near-duplicate windows.
        # (train_test_split shuffles by default, which would defeat that.)
        tr, de = train_test_split(segments[key], test_size=args.dev_size, shuffle=False)
        train += tr
        dev += de

    to_path = "source_code/json"
    os.makedirs(to_path, exist_ok=True)

    with open(os.path.join(to_path, "train.jsonl"), "w") as f:
        f.write("\n".join(train))

    with open(os.path.join(to_path, "dev.jsonl"), "w") as f:
        f.write("\n".join(dev))