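"""LLM-based evaluation of generated hypotheses against gold hypotheses.

A generated hypothesis (HypoB) is compared with a gold hypothesis (HypoA) along
three dimensions -- context, variables, and relations -- by querying an LLM, and
the per-dimension scores are combined into a final score for the hypothesis pair.
"""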
import json
import logging

from openai import OpenAI

from .lm_utils import run_chatgpt_query_multi_turn
from .openai_helpers import get_response

logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)


def get_score_from_answer(answer_type, answer):
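    """Convert an LLM answer for a given dimension into a numeric score.

    'context' answers map to 1.0 (same) / 0.0 (different); 'var' answers are parsed
    as JSON and turned into precision/recall/F1 over the hypothesis variables; 'rel'
    answers map to 1.0 / 0.5 / 0.0. Answers that cannot be parsed or matched score
    -1.0 (a dict of -1.0 values in the 'var' case).
    """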
    if answer_type == 'context':
        answer = answer.replace('Answer:', '').strip()
        if answer.startswith('A)'):
            return 1.0
        elif answer.startswith('B)'):
            return 0.0
        return -1.0
    elif answer_type == 'var':
        try:
            var_json = json.loads(answer)
            p = 0.0
            r = 0.0
            f1 = 0.0
            if var_json['sizeB']:
                p = var_json['intersection'] / var_json['sizeB']
            if var_json['sizeA']:
                r = var_json['intersection'] / var_json['sizeA']
            if p > 0.0 and r > 0.0:
                f1 = (2 * p * r) / (p + r)
            else:
                f1 = 0.0
            eval_rec = {
                'p': p,
                'r': r,
                'f1': f1,
                'sizeA': var_json['sizeA'],
                'sizeB': var_json['sizeB'],
                'intersection': var_json['intersection'],
                'explanation': var_json['explanation'],
            }
            logger.info(f'var_eval: {eval_rec}')
            return eval_rec
        except Exception:
            return {'p': -1.0, 'r': -1.0, 'f1': -1.0}
    elif answer_type == 'rel':
        try:
            logger.info(answer)
            rel_json = json.loads(answer)
            answer_str = rel_json['answer'].strip()
            if answer_str.startswith('A') or 'very similar' in answer_str:
                return 1.0
            elif (
                answer_str.startswith('B')
                or 'similar but more general than HypoA' in answer_str
            ):
                return 0.5
            elif answer_str.startswith('C') or 'different' in answer_str:
                return 0.0
            return -1.0
        except Exception:  # unparseable relation answer
            return -1.0
    return -1.0


def ask_dimension_question(
    query,
    gold_hypo,
    gold_workflow,
    gen_hypo,
    gen_workflow,
    dataset_meta,
    llm_used,
    dimension,
    dataset_type,
    use_column_metadata=True,
):
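    """Ask the LLM to compare HypoA (gold) and HypoB (generated) along one dimension.

    dimension is one of 'context', 'var', or 'rel'. Returns the question text, the
    raw LLM answer, and the score derived from it by get_score_from_answer.
    """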
    dimension_question = ''
    answer = ''
    score = 0.0
    if dimension == 'var':
        score = {'p': -1.0, 'r': -1.0, 'f1': -1.0}
    num_tokens = 256
    num_retries = 1
    json_response = False

    messages = [
        {
            'role': 'system',
            'content': 'You are an AI assistant that helps evaluate a data-driven hypothesis. You are a helpful assistant who is not talkative. You only respond with the exact answer to a query without additional conversation.',
        },
    ]
    if dimension == 'context':
        dimension_question = """\
Question: Is HypoB defined in the same context as HypoA?
(Context refers to assumptions/stratification under which the hypotheses are defined.)
Options: A) same B) different
What is your answer?"""
    elif dimension == 'var':
        dimension_question = """\
Question: For both HypoA and HypoB, what are the different variables found in the hypotheses? \
Return your answer as a JSON object in the following format:
```json
{{
"sizeA": num of variables used in HypoA
"sizeB": num of variables used in HypoB
"intersection": num of variables common in HypoA and HypoB. Use *fuzzy matching* to determine intersection, accounting for paraphrases or slightly different surface forms
"explanation": a short text explanation about the variables
}}```
Answer:"""
        num_tokens = 512
        num_retries = 1
        json_response = True
    elif dimension == 'rel':
        dimension_question = """\
Question: Does HypoB exhibit the same relation as HypoA?
Compare using the following example hierarchy of relationships (based on specificity): \
"there exists a relationship" > "positive relationship" > "positive AND (linear OR quadratic)" > "positive AND linear".
Options: A) very similar B) similar but more general than HypoA C) different
Return your answer as a JSON object in the following format:
```json
{{
"answer": one of the options from A) very similar B) similar but more general than HypoA C) different
"explanation": a short text explanation about the relationship comparison
}}```
Answer:"""
        num_tokens = 512
        num_retries = 1
        json_response = True
    datasets_json = prepare_dataset_metadata_json(
        dataset_meta, dataset_type=dataset_type, use_column_metadata=use_column_metadata
    )

    dimension_question_str = f"""\
You are going to compare two natural-language hypotheses HypoA and HypoB accompanied with optional workflows: WorkflowA for HypoA and WorkflowB for HypoB. \
Both the hypotheses answer the natural language query "QUERY" over the dataset(s) described by dataset description(s) and column description(s) below. \
Compare HypoA and HypoB in terms of three aspects: Contexts, Variables, and Relations. \
E.g., for the hypothesis "From 1995 to 2009, the number of sandhill cranes around the tundra (Indigilka River) surged by an astounding ~10X":
* Contexts refer to stratification of the data under which the given hypothesis is True. E.g., "For all women", "From 1995 to 2009".
* Variables refer to the set of variables (either dependent or independent) that are mentioned in the hypothesis. E.g., number of sandhill cranes, location.
* Relations refer to the form of relation between the variables. E.g., "surged by ~10x".
Answer the following questions for a given pair of hypotheses, HypoA and HypoB, along with an explanation grounded on the QUERY and the DATASET(S).
Here is the metadata for the task:
```json
{{
"datasets": {datasets_json},
"query": {query},
"HypoA": {gold_hypo},
"WorkflowA": {gold_workflow},
"HypoB": {gen_hypo},
"WorkflowB": {gen_workflow}
}}
```
{dimension_question}"""

    messages.append({'role': 'user', 'content': dimension_question_str})

    response = None
    for _ in range(num_retries):
        response = run_chatgpt_query_multi_turn(
            messages=messages,
            model_name=llm_used,
            max_tokens=num_tokens,
            temperature=0,  # 0 for greedy (best) decoding
            json_response=json_response,
        )
        if response is not None:
            break

    if response is not None:
        answer = response.choices[0].message.content.strip()
        score = get_score_from_answer(answer_type=dimension, answer=answer)

    return dimension_question, answer, score


def prepare_dataset_metadata_json(dataset_meta, dataset_type, use_column_metadata=True):
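    """Build a JSON-serializable list of dataset descriptions and column metadata for prompting.

    Returns one {'dataset_description', 'columns'} entry per dataset; column
    descriptions are omitted when use_column_metadata is False, and a single
    empty entry is returned when dataset_meta is None.
    """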
    if dataset_meta is None:
        return [
            {
                'dataset_description': '',
                'columns': [],
            }
        ]
    datasets_json = []
    if dataset_type == 'real':
        for d in dataset_meta['datasets']:
            datasets_json.append(
                {
                    'dataset_description': d['description'],
                    'columns': [
                        {'name': col['name'], 'description': col['description']}
                        for col in d['columns']['raw']
                    ]
                    if use_column_metadata
                    else [],
                }
            )
    else:
        for d in dataset_meta['datasets']:
            datasets_json.append(
                {
                    'dataset_description': d['description'],
                    'columns': [
                        {'name': col['name'], 'description': col['description']}
                        for col in d['columns']
                    ]
                    if use_column_metadata
                    else [],
                }
            )
    return datasets_json


def get_sub_hypotheses(
    query,
    hypo,
    workflow,
    dataset_meta,
    llm_used,
    dataset_type,
    use_column_metadata=True,
):
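    """Ask the LLM to decompose a hypothesis (plus its workflow) into sub-hypotheses.

    Each sub-hypothesis is annotated with its context, variables, and relations.
    Returns a dict with a 'sub_hypo' list (empty if extraction failed) and the
    full hypothesis under 'full_hypo'.
    """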
    client = OpenAI()
    extraction_prompt = """\
Given a set of dataset columns, a ground-truth hypothesis, and the analysis workflow used, your task is to extract three dimensions that define the hypothesis: Context, Variables, and Relations. \
Here are the definitions for these dimensions:
- Contexts: Boundary conditions that limit the scope of a hypothesis. E.g., “for men over \
the age of 30”, “in Asia and Europe”. If the context applies to the full dataset, then extract the context from the dataset_description.
- Variables: Known concepts that interact in a meaningful way under a given context to \
produce the hypothesis. E.g., gender, age, income, or "None" if there is no interacting variable.
- Relations: Interactions between a given set of variables under a given context to produce \
the hypothesis. E.g., “quadratic relationship”, “inversely proportional”, piecewise conditionals, \
or "None" if there is no interacting relationship.
Make sure to only use the information present in the hypothesis and the workflow. Do not add any new information. \
For each dimension, be specific, and do not omit any important details.
Here is the metadata for the task:
```json
{
"datasets": %s,
"hypothesis": "%s",
"workflow": "%s"
}
```
Return your answer as a JSON object in the following format:
```json
{
"sub_hypo": [
{
"text": the hypothesis in natural language,
"context": a short text description of the context of the hypothesis,
"variables": a list of columns involved in the hypothesis,
"relations": a short text description of the relationship between the variables of the hypothesis
},
...
]
}```
"""
    datasets_json = prepare_dataset_metadata_json(
        dataset_meta, dataset_type, use_column_metadata=use_column_metadata
    )
    _prompt = extraction_prompt % (datasets_json, hypo, workflow)
    sub_hypo_json = get_response(client, _prompt, model=llm_used, max_retry=1)

    if sub_hypo_json is not None:
        logger.info(f'sub_hypo_json: {sub_hypo_json}')
    else:
        sub_hypo_json = {
            'sub_hypo': [],
        }

    sub_hypo_json['full_hypo'] = hypo
    return sub_hypo_json


def match_context_with_gpt(
    gold_hyp, gold_context, pred_hyp, pred_context, model='gpt-3.5-turbo'
):
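    """Ask the LLM whether the predicted context semantically matches the gold context.

    Returns a boolean; a missing or unusable LLM response counts as a non-match.
    """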
    prompt = f"""\
Given a gold hypothesis, a gold context, a predicted hypothesis, and a predicted context, your task is \
to determine if the predicted context semantically matches the ground-truth context. \
Here is the definition for Context: Boundary conditions that limit the scope of a sub-hypothesis. E.g., “for men over the age of 30”, “in Asia and Europe”. If the context applies to the full dataset, then the context is derived from the dataset_description. \
If the predicted context matches the gold context, return true, otherwise return false.
If both gold and predicted hypotheses are defined over the context of the full dataset, then also return true.
Here is the metadata for the task:
```json
{{
"gold_hypothesis": "{gold_hyp}",
"gold_context": "{gold_context}",
"predicted_hypothesis": "{pred_hyp}",
"predicted_context": "{pred_context}"
}}
```
Return your answer as a JSON object in the following format:
```json
{{
"match": true or false
}}
```"""

    client = OpenAI()
    output = get_response(client, prompt, model=model)
    if output is None:
        # no usable LLM response; treat as a non-match
        return False
    return output.get('match', False)


def is_matching_context(gold_hyp, gold_context, pred_hyp, pred_context, llm_used):
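    """Return True if the contexts match exactly or the LLM judges them semantically equivalent.

    A 'None' context never matches a non-'None' context; the LLM is consulted only
    when both contexts are non-'None' and not identical.
    """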
    if gold_context == pred_context:
        return True
    if 'None' in [gold_context, pred_context]:
        return False
    return match_context_with_gpt(
        gold_hyp, gold_context, pred_hyp, pred_context, model=llm_used
    )


def run_eval_gold_vs_gen_NL_subhypo(
    query,
    gold_hypo,
    gold_workflow,
    gen_hypo,
    gen_workflow,
    dataset_meta,
    llm_used,
    context_score,
    dataset_type,
    use_column_metadata=True,
):
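    """GPT-based evaluation of one matched (gold, generated) sub-hypothesis pair.

    Scores the pair on the 'var' and 'rel' dimensions, attaches the precomputed
    context score, and combines them as
    accuracy_score = context_score * var_f1 * rel_score.
    """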
    # GPT-4 based evaluation of the generated hypothesis in terms of context, variables, and relation
    eval_rec = {
        'query': query,
        'HypoA': gold_hypo,
        'WorkflowA': gold_workflow,
        'HypoB': gen_hypo,
        'WorkflowB': gen_workflow,
    }

    for dimension in ['var', 'rel']:
        question, answer, score = ask_dimension_question(
            query,
            gold_hypo,
            gold_workflow,
            gen_hypo,
            gen_workflow,
            dataset_meta,
            llm_used,
            dimension=dimension,
            dataset_type=dataset_type,
            use_column_metadata=use_column_metadata,
        )
        eval_rec[dimension] = {'question': question, 'answer': answer, 'score': score}

    eval_rec['context'] = context_score
    eval_rec['accuracy_score'] = (
        1.0
        * eval_rec['context']['score']
        * eval_rec['var']['score']['f1']
        * eval_rec['rel']['score']
    )
    return eval_rec


def run_eval_gold_vs_gen_NL_hypo_workflow(
    query,
    gold_hypo,
    gold_workflow,
    gen_hypo,
    gen_workflow,
    dataset_meta,
    llm_used,
    dataset_type,
    use_column_metadata=True,
):
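    """Top-level evaluation of a generated hypothesis/workflow against the gold pair.

    Both hypotheses are decomposed into sub-hypotheses, generated sub-hypotheses are
    aligned to gold ones by matching contexts, each aligned pair is scored with
    run_eval_gold_vs_gen_NL_subhypo, and the final score is
    recall_context * mean accuracy over generated sub-hypotheses (unmatched ones count as 0).
    """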
    # Input: Dataset Metadata, Query, Gold {Hg, Wg}, Predicted {Hp, Wp}
    # Output: eval_rec json that includes the final_score
    # Procedure:
    #   Gold: [Hg_1, Hg_2, ...] (computed on the fly); each Hg_i is an NL sub-hypothesis
    #   Predicted: [Hp_1, Hp_2, ...] (computed on the fly)
    #   Compute the intersection [(Hg_i, Hp_j), ...]: tuples of (gold, pred) whose contexts match
    #     (done without explicit context extraction), filtered so that each gold and each
    #     predicted context is attached to at most one tuple
    #   Compute recall_context programmatically
    #   For each (Hg_i, Hp_j) in the intersection:
    #     with Hg_i, Hp_j in NL, ask GPT-4 for #variables, #intersection, and an explanation,
    #       then compute f1_v programmatically
    #     with Hg_i, Hp_j in NL, ask GPT-4 for a relation-matching score (0, 0.5, or 1):
    #       A) very similar  B) similar but more general than HypoA  C) different, plus an explanation
    #     collect f1_v * score_r
    #   accuracy_score = mean over matched pairs of (context_score * var_score * rel_score)
    #   final_score = recall_context * accuracy_score

    eval_rec = {
        'query': query,
        'HypoA': gold_hypo,
        'WorkflowA': gold_workflow,
        'HypoB': gen_hypo,
        'WorkflowB': gen_workflow,
    }
    gold_sub_hypo_json = get_sub_hypotheses(
        query=query,
        hypo=gold_hypo,
        workflow=gold_workflow,
        dataset_meta=dataset_meta,
        llm_used=llm_used,
        dataset_type=dataset_type,
        use_column_metadata=use_column_metadata,
    )
    if len(gold_sub_hypo_json['sub_hypo']) == 0:
        gold_sub_hypo_json['sub_hypo'] = [
            {
                'text': gold_hypo,
                'context': 'None',
                'variables': [],
                'relations': '',
                'explanation': 'unable to segment',
            }
        ]
    logger.info(f'gold_sub_hypo_json: {gold_sub_hypo_json}')

    gen_sub_hypo_json = get_sub_hypotheses(
        query=query,
        hypo=gen_hypo,
        workflow=gen_workflow,
        dataset_meta=dataset_meta,
        llm_used=llm_used,
        dataset_type=dataset_type,
        use_column_metadata=use_column_metadata,
    )
    if len(gen_sub_hypo_json['sub_hypo']) == 0:
        gen_sub_hypo_json['sub_hypo'] = [
            {
                'text': gen_hypo,
                'context': 'None',
                'variables': [],
                'relations': '',
                'explanation': 'unable to segment',
            }
        ]
    logger.info(f'gen_sub_hypo_json: {gen_sub_hypo_json}')

    eval_rec['gold_sub_hypo'] = gold_sub_hypo_json
    eval_rec['gen_sub_hypo'] = gen_sub_hypo_json
    gold_subh_covered = []
    gen_subh_to_gold_subh = dict()
    gen_gold_subh_to_context = dict()

    for p_id, gen_subh in enumerate(gen_sub_hypo_json['sub_hypo']):
        gen_subh_to_gold_subh[p_id] = -1

        for g_id, gold_subh in enumerate(gold_sub_hypo_json['sub_hypo']):
            if g_id in gold_subh_covered:
                continue

            # match context
            context_bool = is_matching_context(
                gold_subh['text'],
                gold_subh.get('context', ''),
                gen_subh['text'],
                gen_subh.get('context', ''),
                llm_used,
            )
            if context_bool:
                context_score = 1.0
            else:
                context_score = 0.0

            if context_score == 1.0:  # match only when context_score == 1.0
                gen_subh_to_gold_subh[p_id] = g_id
                gold_subh_covered.append(g_id)
                gen_gold_subh_to_context[f'P{p_id}||G{g_id}'] = {
                    'question': f"""Comparing: GoldH: {gold_subh["text"]}, GoldC: {gold_subh['context']}\nGenH: {gen_subh['text']}, GenC: {gen_subh['context']}""",
                    'answer': context_bool,
                    'score': context_score,
                }
                break

    logger.info(f'gen_subh_to_gold_subh: {gen_subh_to_gold_subh}')
    eval_rec['gen_subh_to_gold_subh'] = gen_subh_to_gold_subh
    eval_rec['gold_subh_covered'] = gold_subh_covered
    matched_gold_gen_subh_evals = dict()
    sum_accuracy_score = 0.0
    for p_id, g_id in gen_subh_to_gold_subh.items():
        if g_id >= 0:
            key = f'P{p_id}||G{g_id}'
            context_score = gen_gold_subh_to_context[key]
            subh_eval_rec = run_eval_gold_vs_gen_NL_subhypo(
                query,
                gold_hypo,
                gold_workflow,
                gen_hypo,
                gen_workflow,
                dataset_meta,
                llm_used,
                context_score,
                dataset_type=dataset_type,
                use_column_metadata=use_column_metadata,
            )
            sum_accuracy_score += subh_eval_rec['accuracy_score']
            matched_gold_gen_subh_evals[key] = subh_eval_rec

    eval_rec['matched_gold_gen_subh_evals'] = matched_gold_gen_subh_evals
    eval_rec['recall_context'] = (
        len(gold_subh_covered) / len(gold_sub_hypo_json['sub_hypo'])
        if len(gold_sub_hypo_json['sub_hypo'])
        else 0.0
    )
    mean_accuracy_score = (
        sum_accuracy_score / len(gen_subh_to_gold_subh)
        if len(gen_subh_to_gold_subh)
        else 0.0
    )
    eval_rec['mean_accuracy_score'] = mean_accuracy_score
    final_score = eval_rec['recall_context'] * mean_accuracy_score
    eval_rec['final_score'] = final_score

    logger.info(f'eval_rec: {json.dumps(eval_rec, indent=2)}')
    return eval_rec
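

# Minimal usage sketch (not part of the evaluation pipeline itself). The dataset
# metadata, query, hypotheses, and model name below are hypothetical placeholders
# that only illustrate how run_eval_gold_vs_gen_NL_hypo_workflow is typically
# invoked; real callers load these values from their benchmark files. Because of
# the relative imports, run it as a module (python -m <package>.<this_module>),
# and note that it issues live LLM calls, so an OpenAI API key must be configured.
if __name__ == '__main__':
    example_dataset_meta = {  # assumed layout; see prepare_dataset_metadata_json
        'datasets': [
            {
                'description': 'Yearly counts of sandhill cranes near the Indigilka River.',
                'columns': {
                    'raw': [
                        {'name': 'year', 'description': 'Observation year'},
                        {'name': 'count', 'description': 'Number of cranes observed'},
                    ]
                },
            }
        ]
    }
    result = run_eval_gold_vs_gen_NL_hypo_workflow(
        query='How did the sandhill crane population change from 1995 to 2009?',
        gold_hypo='From 1995 to 2009, the number of sandhill cranes around the tundra (Indigilka River) surged by ~10X.',
        gold_workflow='',
        gen_hypo='Sandhill crane counts increased roughly tenfold between 1995 and 2009.',
        gen_workflow='',
        dataset_meta=example_dataset_meta,
        llm_used='gpt-4o',  # placeholder model name; must be supported by run_chatgpt_query_multi_turn
        dataset_type='real',
    )
    print(result['final_score'])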