You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
55 lines
1.7 KiB
55 lines
1.7 KiB
import json
from collections import defaultdict

import numpy as np
import tiktoken  # for token counting
|
|
data_path = "fineTuning/finetuning_appreciations_tuning_v2.jsonl"

# Keys and roles accepted in a chat message. Kept as tuples (not sets) so that
# membership tests never raise on an unhashable value such as a list-valued role.
_ALLOWED_MESSAGE_KEYS = ("role", "content", "name", "function_call", "weight")
_ALLOWED_ROLES = ("system", "user", "assistant", "function")


def load_dataset(path):
    """Read a JSONL file and return its records as a list.

    Args:
        path: Location of the UTF-8 encoded JSONL file.

    Returns:
        One parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline at end of file); json.loads
        # would raise on them even though they carry no record.
        return [json.loads(line) for line in f if line.strip()]


def count_format_errors(dataset):
    """Count structural problems in a list of chat fine-tuning examples.

    Args:
        dataset: Iterable of parsed examples; each is expected to be a dict
            with a non-empty "messages" list of message dicts.

    Returns:
        A ``defaultdict(int)`` mapping an error label to the number of times it
        was seen. It is empty (falsy) when no problem was found.
    """
    format_errors = defaultdict(int)

    for ex in dataset:
        if not isinstance(ex, dict):
            format_errors["data_type"] += 1
            continue

        messages = ex.get("messages", None)
        # A truthy non-list value (string, dict, number) is not a usable
        # messages list either; count it instead of crashing further down.
        if not messages or not isinstance(messages, list):
            format_errors["missing_messages_list"] += 1
            continue

        for message in messages:
            if not isinstance(message, dict):
                # The key/role checks below need a dict; report and move on.
                format_errors["message_data_type"] += 1
                continue

            if "role" not in message or "content" not in message:
                format_errors["message_missing_key"] += 1

            if any(k not in _ALLOWED_MESSAGE_KEYS for k in message):
                format_errors["message_unrecognized_key"] += 1

            if message.get("role", None) not in _ALLOWED_ROLES:
                format_errors["unrecognized_role"] += 1

            content = message.get("content", None)
            function_call = message.get("function_call", None)

            # NOTE(review): because of the isinstance check, a message that has
            # a function_call but non-string content (e.g. null) is still
            # counted here - confirm this is intended.
            if (not content and not function_call) or not isinstance(content, str):
                format_errors["missing_content"] += 1

        if not any(
            isinstance(message, dict) and message.get("role", None) == "assistant"
            for message in messages
        ):
            format_errors["example_missing_assistant_message"] += 1

    return format_errors


if __name__ == "__main__":
    # Load the dataset
    dataset = load_dataset(data_path)

    # Initial dataset stats
    print("Num examples:", len(dataset))
    if dataset:
        print("First example:")
        first_example = dataset[0]
        first_messages = (
            first_example.get("messages") if isinstance(first_example, dict) else None
        )
        if isinstance(first_messages, list):
            for message in first_messages:
                print(message)
        else:
            # Malformed first record: show it as-is rather than raising.
            print(first_example)

    # Format error checks
    format_errors = count_format_errors(dataset)

    if format_errors:
        print("Found errors:")
        for k, v in format_errors.items():
            print(f"{k}: {v}")
    else:
        print("No errors found")
|
|