Skip to content

Commit

Permalink
add simon data
Browse files Browse the repository at this point in the history
  • Loading branch information
terryyz committed Apr 13, 2024
1 parent 8ccc485 commit e3d2383
Show file tree
Hide file tree
Showing 161 changed files with 19,548 additions and 0 deletions.
119 changes: 119 additions & 0 deletions data/raw/f_644_simon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import pandas as pd
from sklearn.preprocessing import MinMaxScaler


def f_644(list_of_pairs):
    """
    Build a pandas DataFrame from (category, value) pairs and min-max scale the values.

    Parameters:
    list_of_pairs (list): A list of tuples, where the first element is the category and
                          the second element is the value.

    Returns:
    DataFrame: A pandas DataFrame containing the columns 'Category' and 'Value'.
               Category contains the first element of each tuple.
               Value contains the normalized second element of each tuple.

    Raises:
    Exception: If the input array is empty.
    ValueError: If Values are not numeric.

    Requirements:
    - pandas
    - sklearn.preprocessing.MinMaxScaler

    Example:
    >>> list_of_pairs = [('Fruits', 5), ('Vegetables', 9), ('Dairy', -1), ('Bakery', -2), ('Meat', 4)]
    >>> df = f_644(list_of_pairs)
    >>> print(df)
         Category     Value
    0      Fruits  0.636364
    1  Vegetables  1.000000
    2       Dairy  0.090909
    3      Bakery  0.000000
    4        Meat  0.545455
    >>> list_of_pairs = [('car', 3.2), ('bike', 0), ('train', -1), ('plane', -6.2), ('ship', 1234)]
    >>> df = f_644(list_of_pairs)
    >>> print(df)
      Category     Value
    0      car  0.007579
    1     bike  0.004999
    2    train  0.004193
    3    plane  0.000000
    4     ship  1.000000
    """
    # Guard: there is nothing to scale without at least one pair.
    if len(list_of_pairs) == 0:
        raise Exception('The input array should not be empty.')

    frame = pd.DataFrame(list_of_pairs, columns=['Category', 'Value'])

    # Guard: a non-numeric entry leaves the column with a non-numeric dtype,
    # which the scaler cannot process.
    if not pd.api.types.is_numeric_dtype(frame['Value']):
        raise ValueError('The values have to be numeric.')

    # The scaler expects a 2-D input, hence the double-bracket column selection.
    frame['Value'] = MinMaxScaler().fit_transform(frame[['Value']])
    return frame


import unittest

class TestCases(unittest.TestCase):
    """Unit tests for f_644 (DataFrame construction plus min-max normalization)."""

    def _assert_scaled(self, result, expected):
        """Assert each category's normalized value matches `expected` to 6 places."""
        for category, value in expected.items():
            with self.subTest(category=category):
                actual = result.loc[result['Category'] == category, 'Value'].item()
                self.assertAlmostEqual(actual, value, places=6)

    def test_case_1(self):
        '''test with normal input data'''
        input_data = [('traditional', -4), ('we', 7), ('because', 3), ('ability', 10), ('exactly', -7)]
        result = f_644(input_data)
        self.assertIsInstance(result, pd.DataFrame)
        self.assertIn('Value', result.columns)
        self._assert_scaled(result, {
            'traditional': 0.176471,
            'we': 0.823529,
            'because': 0.588235,
            'ability': 1.000000,
            'exactly': 0.000000,
        })

    def test_case_2(self):
        '''test empty input'''
        # The function documents a plain Exception for empty input.
        with self.assertRaises(Exception):
            f_644([])

    def test_case_3(self):
        '''non numeric values'''
        input_data = [('fast', 'test'), ('ago', -8), ('player', 7), ('standard', 2), ('specific', 0)]
        # Pin the documented exception type; a bare Exception would also pass on
        # unrelated failures such as a NameError or TypeError.
        with self.assertRaises(ValueError):
            f_644(input_data)

    def test_case_4(self):
        '''Floating point values'''
        input_data = [('real', 4.453), ('others', -1.12), ('professor', -2.2), ('other', -5), ('task', -7.933)]
        result = f_644(input_data)
        self.assertIsInstance(result, pd.DataFrame)
        self.assertIn('Value', result.columns)
        self._assert_scaled(result, {
            'real': 1.000000,
            'others': 0.550057,
            'professor': 0.462861,
            'other': 0.236800,
            'task': 0.000000,
        })

    def test_case_5(self):
        '''test for basic output structure'''
        input_data = [('visit', 4), ('brother', -2), ('experience', -10), ('whether', 8), ('hand', 3)]
        result = f_644(input_data)
        self.assertIsInstance(result, pd.DataFrame)
        self.assertIn('Value', result.columns)
        self.assertIn('Category', result.columns)
        # Normalized values must stay inside the unit interval.
        self.assertTrue(0 <= result['Value'].min() <= 1)
        self.assertTrue(0 <= result['Value'].max() <= 1)

def run_tests():
    """Collect every test in TestCases and run it with the text test runner."""
    suite = unittest.TestSuite()
    # unittest.makeSuite was deprecated in Python 3.11 and removed in 3.13;
    # TestLoader.loadTestsFromTestCase is the supported equivalent.
    suite.addTest(unittest.defaultTestLoader.loadTestsFromTestCase(TestCases))
    runner = unittest.TextTestRunner()
    runner.run(suite)


# Run the suite only when the file is executed as a script, not on import.
if __name__ == "__main__":
    run_tests()
137 changes: 137 additions & 0 deletions data/raw/f_645_simon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import nltk
from string import punctuation
import pandas as pd


def f_645(text):
    """
    Finds all words in a text, that are separated by whitespace,
    beginning with the "$" character and computes their number of occurrences.

    Parameters:
    text (str): The input text.

    Returns:
    DataFrame: A pandas DataFrame with two columns: "Word" and "Frequency".
               "Word" contains the '$' prefixed words, and "Frequency" contains their occurrences.

    Raises:
    ValueError: if text is not a string

    Requirements:
    - nltk
    - string
    - pandas

    Note:
    The function ignores words that are entirely made up of punctuation, even if they start with a '$'.
    A '$' in the middle of a word does not count; only the first character of a token is checked.

    Example:
    >>> text = "$abc def $efg $hij klm $ $abc $abc $hij $hij"
    >>> f_645(text)
       Word  Frequency
    0  $abc          3
    1  $efg          1
    2  $hij          3
    >>> text = "$hello this i$s a $test $test $test"
    >>> f_645(text)
         Word  Frequency
    0  $hello          1
    1   $test          3
    """
    if not isinstance(text, str):
        raise ValueError("The input should be a string.")

    # Built once per call so the filter below does not rebuild the set for
    # every character of every token.
    punctuation_chars = set(punctuation)

    words = nltk.WhitespaceTokenizer().tokenize(text)
    dollar_words = [
        word
        for word in words
        # issuperset(word) is True exactly when every character is punctuation.
        if word.startswith('$') and not punctuation_chars.issuperset(word)
    ]
    freq = nltk.FreqDist(dollar_words)
    return pd.DataFrame(list(freq.items()), columns=["Word", "Frequency"])

import unittest


class TestCases(unittest.TestCase):
    """Unit tests for f_645 (frequency table of '$'-prefixed words)."""

    def _assert_frequencies(self, text, expected_words, expected_freqs):
        """Run f_645 on `text` and compare both result columns, order included."""
        result = f_645(text)
        self.assertListEqual(result["Word"].tolist(), expected_words)
        self.assertListEqual(result["Frequency"].tolist(), expected_freqs)

    def test_case_1(self):
        self._assert_frequencies(
            "$abc def $efg $hij klm $ $abc $abc $hij $hij",
            ["$abc", "$efg", "$hij"],
            [3, 1, 3],
        )

    def test_case_2(self):
        text = "This is a test without dollar words."
        result = f_645(text)
        self.assertEqual(len(result), 0)

    def test_case_3(self):
        self._assert_frequencies(
            "$test1 $test2 $test1 $test3",
            ["$test1", "$test2", "$test3"],
            [2, 1, 1],
        )

    def test_case_4(self):
        # "$!" and "$$" consist only of punctuation and must be ignored.
        self._assert_frequencies("$! $$ $a $a $a", ["$a"], [3])

    def test_case_5(self):
        self._assert_frequencies(
            "$word1 word2 $word2 $word1 $word3 $word1",
            ["$word1", "$word2", "$word3"],
            [3, 1, 1],
        )

    def test_case_6(self):
        '''empty input string'''
        self._assert_frequencies("", [], [])

    def test_case_7(self):
        '''check for correct return type'''
        text = "$test 123 abcd.aef"
        result = f_645(text)
        self.assertIsInstance(result, pd.DataFrame)
        self.assertIn('Word', result.columns)
        self.assertIn('Frequency', result.columns)

    def test_case_8(self):
        '''word with $ in the middle'''
        self._assert_frequencies("asdfj;alskdfj;$kjhkjhdf", [], [])

    def test_case_9(self):
        '''non string input'''
        # Named so it does not shadow the builtin `input`; asserts the documented
        # ValueError rather than any Exception.
        non_string_input = 24
        with self.assertRaises(ValueError):
            f_645(non_string_input)




def run_tests():
    """Collect every test in TestCases and run it with the text test runner."""
    suite = unittest.TestSuite()
    # unittest.makeSuite was deprecated in Python 3.11 and removed in 3.13;
    # TestLoader.loadTestsFromTestCase is the supported equivalent.
    suite.addTest(unittest.defaultTestLoader.loadTestsFromTestCase(TestCases))
    runner = unittest.TextTestRunner()
    runner.run(suite)


# Run the suite only when the file is executed as a script, not on import.
if __name__ == "__main__":
    run_tests()
Loading

0 comments on commit e3d2383

Please sign in to comment.