import fnmatch
import os
import shutil
from multiprocessing import Pool

import h5py
import numpy as np
import tensorflow as tf
from tqdm import tqdm


def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    if isinstance(value, type(tf.constant(0))):
        # BytesList won't unpack a string from an EagerTensor.
        value = value.numpy()
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _bool_feature(value):
    """Returns an int64_list from a boolean."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[int(value)]))


def serialize_example(
    action,
    base_action,
    qpos,
    qvel,
    cam_high,
    cam_left_wrist,
    cam_right_wrist,
    instruction,
    terminate_episode,
):
    """Serializes one timestep into a tf.train.Example proto string."""
    feature = {
        "action": _bytes_feature(tf.io.serialize_tensor(action)),
        "base_action": _bytes_feature(tf.io.serialize_tensor(base_action)),
        "qpos": _bytes_feature(tf.io.serialize_tensor(qpos)),
        "qvel": _bytes_feature(tf.io.serialize_tensor(qvel)),
        # Camera frames are stored as raw bytes wrapped in a scalar string tensor.
        "cam_high": _bytes_feature(
            tf.io.serialize_tensor(tf.convert_to_tensor(cam_high.tobytes(), dtype=tf.string))
        ),
        "cam_left_wrist": _bytes_feature(
            tf.io.serialize_tensor(tf.convert_to_tensor(cam_left_wrist.tobytes(), dtype=tf.string))
        ),
        "cam_right_wrist": _bytes_feature(
            tf.io.serialize_tensor(tf.convert_to_tensor(cam_right_wrist.tobytes(), dtype=tf.string))
        ),
        "instruction": _bytes_feature(instruction),
        "terminate_episode": _bool_feature(terminate_episode),
    }
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()


def process_hdf5_file(args):
    """Converts one HDF5 episode file into a TFRecord file, mirroring the directory layout."""
    filepath, root_dir, out_dir = args
    output_dir = os.path.join(out_dir, os.path.relpath(os.path.dirname(filepath), root_dir))
    os.makedirs(output_dir, exist_ok=True)
    filename = os.path.basename(filepath)
    tfrecord_path = os.path.join(output_dir, filename.replace(".hdf5", ".tfrecord"))

    # Skip files that were already converted in a previous run.
    if os.path.exists(tfrecord_path) and os.path.getsize(tfrecord_path) > 0:
        return f"TFRecords already exist at {tfrecord_path}"

    try:
        with h5py.File(filepath, "r") as f, tf.io.TFRecordWriter(tfrecord_path) as writer:
            # Number of timesteps in this episode.
            num_steps = f["action"].shape[0]

            # Remove the leading still steps: find the first timestep whose qpos
            # differs from the initial qpos by more than EPS in any joint.
            EPS = 1e-2
            qpos = f["observations"]["qpos"][:]
            qpos_delta = np.abs(qpos - qpos[0:1])
            indices = np.where(np.any(qpos_delta > EPS, axis=1))[0]
            if len(indices) > 0:
                first_idx = indices[0]
            else:
                raise ValueError("Found no qpos that exceeds the threshold.")

            # The instruction is episode-level, so read it once outside the loop.
            instruction = f["instruction"][()]

            # Keep one still step before motion starts; clamp so the start
            # index never goes negative when motion begins at step 0.
            for i in range(max(first_idx - 1, 0), num_steps):
                action = f["action"][i]
                base_action = f["base_action"][i]
                qpos = f["observations"]["qpos"][i]
                qvel = f["observations"]["qvel"][i]
                cam_high = f["observations"]["images"]["cam_high"][i]
                cam_left_wrist = f["observations"]["images"]["cam_left_wrist"][i]
                cam_right_wrist = f["observations"]["images"]["cam_right_wrist"][i]
                terminate_episode = i == num_steps - 1
                serialized_example = serialize_example(
                    action,
                    base_action,
                    qpos,
                    qvel,
                    cam_high,
                    cam_left_wrist,
                    cam_right_wrist,
                    instruction,
                    terminate_episode,
                )
                writer.write(serialized_example)
    except Exception as e:
        with open("error_log.txt", "a") as f:
            f.write(f"{filepath}\n")
        print(f"error at {filepath}: {e}")
        # Return a failure message so errors are not reported as successes.
        return f"Failed to write TFRecords for {filepath}: {e}"

    return f"TFRecords written to {tfrecord_path}"


def write_tfrecords(root_dir, out_dir):
    """Walks root_dir, copies instruction JSONs, and converts all HDF5 files in parallel."""
    os.makedirs(out_dir, exist_ok=True)

    hdf5_files = []
    for root, dirs, files in os.walk(root_dir):
        if os.path.exists(os.path.join(root, "expanded_instruction_gpt-4-turbo.json")):
            # Copy the instruction file alongside the TFRecords.
            target_path = os.path.join(out_dir, os.path.relpath(root, root_dir))
            os.makedirs(target_path, exist_ok=True)
            shutil.copy(os.path.join(root, "expanded_instruction_gpt-4-turbo.json"), target_path)
        elif os.path.exists(os.path.join(root, "expanded_instruction.json")):
            print(root)
            target_path = os.path.join(out_dir, os.path.relpath(root, root_dir))
            os.makedirs(target_path, exist_ok=True)
            shutil.copy(os.path.join(root, "expanded_instruction.json"), target_path)
            # Rename the copy to the canonical expanded_instruction_gpt-4-turbo.json.
            os.rename(
                os.path.join(target_path, "expanded_instruction.json"),
                os.path.join(target_path, "expanded_instruction_gpt-4-turbo.json"),
            )
        for filename in fnmatch.filter(files, "*.hdf5"):
            filepath = os.path.join(root, filename)
            hdf5_files.append((filepath, root_dir, out_dir))

    with Pool(16) as pool:
        max_count = len(hdf5_files)
        with tqdm(total=max_count) as pbar:
            for _ in pool.imap_unordered(process_hdf5_file, hdf5_files):
                pbar.update(1)

    print(f"TFRecords written to {out_dir}")


if __name__ == "__main__":
    # Guarding the entry point is required for multiprocessing.Pool on
    # platforms that spawn rather than fork worker processes.
    root_dir = "../datasets/agilex/rdt_data/"
    out_dir = "../datasets/agilex/tfrecords/"
    write_tfrecords(root_dir, out_dir)
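

# --- Read-side reference (not used by this script) ---
# A minimal sketch of how records written above could be parsed back, kept here
# for documentation. The feature names match serialize_example; the tensor
# dtypes (float32 below) are ASSUMPTIONS about the HDF5 source, and the camera
# features decode to raw pixel bytes whose height/width/channels are not stored
# in the record and must be known out of band.
_FEATURE_SPEC = {
    "action": tf.io.FixedLenFeature([], tf.string),
    "base_action": tf.io.FixedLenFeature([], tf.string),
    "qpos": tf.io.FixedLenFeature([], tf.string),
    "qvel": tf.io.FixedLenFeature([], tf.string),
    "cam_high": tf.io.FixedLenFeature([], tf.string),
    "cam_left_wrist": tf.io.FixedLenFeature([], tf.string),
    "cam_right_wrist": tf.io.FixedLenFeature([], tf.string),
    "instruction": tf.io.FixedLenFeature([], tf.string),
    "terminate_episode": tf.io.FixedLenFeature([], tf.int64),
}


def parse_example(serialized):
    """Hypothetical decoder for a single record; illustrative only."""
    parsed = tf.io.parse_single_example(serialized, _FEATURE_SPEC)
    # Tensors written via tf.io.serialize_tensor round-trip with parse_tensor;
    # out_type must match the dtype the writer actually stored (assumed float32).
    action = tf.io.parse_tensor(parsed["action"], out_type=tf.float32)
    qpos = tf.io.parse_tensor(parsed["qpos"], out_type=tf.float32)
    # Camera features unwrap to a scalar string of raw bytes.
    cam_high_bytes = tf.io.parse_tensor(parsed["cam_high"], out_type=tf.string)
    # "instruction" was stored as plain bytes (not serialize_tensor), so it
    # needs no further decoding here.
    return action, qpos, cam_high_bytes, parsed["instruction"]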