# openai_semantic_gen_prompts.py (5.6 KB)

  1. common_hypothesis_features = [
  2. '1-2 sentences',
  3. 'surprising finding',
  4. 'includes numeric concepts',
  5. 'includes categorical concepts',
  6. 'includes binary concepts',
  7. ]
  8. hypothesis_features = [
  9. ['requires within-cluster analysis'],
  10. ['requires across-cluster analysis'],
  11. ['corresponds to a polynomial relationship of some columns'],
  12. ['corresponds to a ratio between some columns'],
  13. ['requires temporal analysis'],
  14. ['relationship is based on descriptive statistics of some columns'],
  15. ['requires concepts based on percentage or percentiles'],
  16. ['relationship is only applicable to one cluster in the data and not the others'],
  17. ]
  18. column_features = [
  19. [
  20. 'must have one target column',
  21. 'must have quantifiable columns',
  22. 'must have a few categorical columns',
  23. 'make sure the categorical column values do not contain special characters',
  24. 'include a few distractor columns',
  25. ]
  26. ]
  27. common_pandas_features = [
  28. 'must be executable using python `eval` to create the target column in variable `df` (pandas dataframe)',
  29. "for e.g., df['A']**2 + 3*df['B'] + 9, np.where(df['A'] > 3, 'Yes', 'No'), etc.",
  30. 'variables in pandas_expression must be from the existing columns listed above',
  31. 'variables in pandas_expression must NOT contain the target column itself',
  32. ]
  33. pandas_features = [
  34. ['expression is a quadratic polynomial'],
  35. ['expression is a cubic polynomial'],
  36. ['expression is a ratio of existing columns'],
  37. ['expression is derived through logical combination of existing columns'],
  38. # workflow
  39. ]
  40. pandas_features = [common_pandas_features + p for p in pandas_features]
  41. common_derived_features = [
  42. '1-2 sentences',
  43. 'includes numeric concepts',
  44. 'includes categorical concepts',
  45. 'includes binary concepts',
  46. ]
  47. derived_features = [common_derived_features + h for h in hypothesis_features]
  48. hypothesis_features = [common_hypothesis_features + h for h in hypothesis_features]
  49. PROMPT_HYP = """\
  50. Given a dataset topic and description, generate an interesting hypothesis based on \
  51. the provided instructions. Be creative and come up with an unusual finding.
  52. ```json
  53. {
  54. "topic": "%s",
  55. "description": "%s",
  56. "hypothesis_features": %s,
  57. "hypothesis": "..."
  58. }```
  59. Give your answer as a new JSON with the following format:
  60. ```json
  61. {
  62. "hypothesis": "..."
  63. }
  64. ```"""
  65. PROMPT_COL = """\
  66. Given a dataset topic, its description, and a true hypothesis that can be determined from it, \
  67. generate a list of valid columns based on the provided instructions.
  68. ```json
  69. {
  70. "topic": "%s",
  71. "description": "%s",
  72. "hypothesis": "%s",
  73. "column_instructions": %s,
  74. "columns": [
  75. {
  76. "col_name": "...", # should be an "_"-separated string
  77. "description": "...",
  78. "data_type": "...", # should be executable using python's `eval` function. E.g., str, float, int, bool
  79. "data_range": {...}, # should be either {"min": ..., "max": ...} or {"values": [...]}
  80. "is_distractor": true/false, # boolean indicating whether this is a distractor that could cause confusion during data analysis
  81. "is_target": true/false # boolean indicating whether this is the target variable for the hypothesis; at least one column should be the target
  82. },
  83. ...
  84. ],
  85. "pandas_instructions": %s,
  86. "pandas_equation_for_hypothesis": {
  87. "target_col": "...",
  88. "target_col_type": "...",
  89. "target_col_range": {...},
  90. "independent_cols_in_pandas_expression": [], # list of column names that will be used to derive the target column
  91. "pandas_expression": "..." # expression to derive df[target_col] using df[ind_col1], df[ind_col2], etc.
  92. }
  93. }```
  94. Give your answer as a new JSON with the "columns" and "pandas_equation_for_hypothesis" keys filled using the following format:
  95. ```json
  96. {
  97. "columns": [...],
  98. "pandas_equation_for_hypothesis": {...}
  99. }
  100. ```"""
  101. PROMPT_DER = """\
  102. Given a dataset topic, description, a true hypothesis that can be determined from the data, \
  103. and a target column from the dataset, generate a hypothesis for the target column using new independent columns not present in the existing columns.
  104. ```json
  105. {
  106. "topic": "%s",
  107. "description": "%s",
  108. "hypothesis": "%s",
  109. "existing_columns": %s,
  110. "target_column": "%s",
  111. "new_to_target_instructions": %s,
  112. "new_to_target_hypothesis": "...", # describe a relationship between new columns that explains the target column
  113. "new_columns_for_target": [ # do not repeat any of the existing columns in the dataset
  114. {
  115. "col_name": "...", # should be an "_"-separated string
  116. "description": "...",
  117. "data_type": "...", # should be executable using python's `eval` function. E.g., str, float, int, bool
  118. "data_range": {...}, # should be either {"min": ..., "max": ...} or {"values": [...]}
  119. },
  120. ...
  121. ],
  122. "pandas_instructions": %s,
  123. "pandas_equation_for_new_to_target_hypothesis": {
  124. "target_col": "...",
  125. "target_col_type": "...",
  126. "target_col_range": {...},
  127. "independent_cols_in_pandas_expression": [], # list of column names from new_columns_for_target that will be used to derive target_col
  128. "pandas_expression": "..." # expression to derive df[target_col] using df[ind_col1], df[ind_col2], etc.
  129. }
  130. }```
  131. Give your answer as a new JSON with the "new_to_target_hypothesis", "new_columns_for_target", and \
  132. "pandas_equation_for_new_to_target_hypothesis" keys filled using the following format:
  133. ```json
  134. {
  135. "new_to_target_hypothesis": "...",
  136. "new_columns_for_target": [...],
  137. "pandas_equation_for_new_to_target_hypothesis": {...}
  138. }
  139. ```"""