-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
161 changed files
with
19,548 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
import pandas as pd | ||
from sklearn.preprocessing import MinMaxScaler | ||
|
||
|
||
def f_644(list_of_pairs):
    """
    Create a Pandas DataFrame from a list of pairs and normalize the data using MinMaxScaler.

    Parameters:
    list_of_pairs (list): A list of tuples, where the first element is the category and
                          the second element is the value.

    Returns:
    DataFrame: A pandas DataFrame containing the columns 'Category' and 'Value'.
               Category contains the first elements of each tuple.
               Value contains the normalized values of each tuple.

    Raises:
    Exception: If the input array is empty.
    ValueError: If Values are not numeric.

    Requirements:
    - pandas
    - sklearn.preprocessing.MinMaxScaler

    Example:
    >>> list_of_pairs = [('Fruits', 5), ('Vegetables', 9), ('Dairy', -1), ('Bakery', -2), ('Meat', 4)]
    >>> df = f_644(list_of_pairs)
    >>> print(df)
         Category     Value
    0      Fruits  0.636364
    1  Vegetables  1.000000
    2       Dairy  0.090909
    3      Bakery  0.000000
    4        Meat  0.545455
    >>> list_of_pairs = [('car', 3.2), ('bike', 0), ('train', -1), ('plane', -6.2), ('ship', 1234)]
    >>> df = f_644(list_of_pairs)
    >>> print(df)
       Category     Value
    0       car  0.007579
    1      bike  0.004999
    2     train  0.004193
    3     plane  0.000000
    4      ship  1.000000
    """
    # Validate before building the frame so the caller gets the documented error.
    if len(list_of_pairs) == 0:
        raise Exception('The input array should not be empty.')

    df = pd.DataFrame(list_of_pairs, columns=['Category', 'Value'])

    # A single non-numeric entry leaves the column with a non-numeric dtype.
    if not pd.api.types.is_numeric_dtype(df['Value']):
        raise ValueError('The values have to be numeric.')

    # The scaler is fitted on a one-column frame (double brackets), not a Series.
    scaler = MinMaxScaler()
    df['Value'] = scaler.fit_transform(df[['Value']])

    return df
|
||
|
||
import unittest | ||
|
||
class TestCases(unittest.TestCase):
    """Unit tests for f_644: normalization results, input validation and output structure."""

    def test_case_1(self):
        '''test with normal input data'''
        input_data = [('traditional', -4), ('we', 7), ('because', 3), ('ability', 10), ('exactly', -7)]
        result = f_644(input_data)
        self.assertIsInstance(result, pd.DataFrame)
        self.assertIn('Value', result.columns)
        self.assertAlmostEqual(result[result['Category'] == 'traditional']['Value'].item(), 0.176471, places=6)
        self.assertAlmostEqual(result[result['Category'] == 'we']['Value'].item(), 0.823529, places=6)
        self.assertAlmostEqual(result[result['Category'] == 'because']['Value'].item(), 0.588235, places=6)
        self.assertAlmostEqual(result[result['Category'] == 'ability']['Value'].item(), 1.000000, places=6)
        self.assertAlmostEqual(result[result['Category'] == 'exactly']['Value'].item(), 0.000000, places=6)

    def test_case_2(self):
        '''test empty input'''
        input_data = []
        # f_644 raises a plain Exception here, so a narrower type cannot be asserted.
        self.assertRaises(Exception, f_644, input_data)

    def test_case_3(self):
        '''non numeric values'''
        input_data = [('fast', 'test'), ('ago', -8), ('player', 7), ('standard', 2), ('specific', 0)]
        # Assert the documented ValueError so an unrelated failure cannot pass the test.
        self.assertRaises(ValueError, f_644, input_data)

    def test_case_4(self):
        '''Floating point values'''
        input_data = [('real', 4.453), ('others', -1.12), ('professor', -2.2), ('other', -5), ('task', -7.933)]
        result = f_644(input_data)
        self.assertIsInstance(result, pd.DataFrame)
        self.assertIn('Value', result.columns)
        self.assertAlmostEqual(result[result['Category'] == 'real']['Value'].item(), 1.000000, places=6)
        self.assertAlmostEqual(result[result['Category'] == 'others']['Value'].item(), 0.550057, places=6)
        self.assertAlmostEqual(result[result['Category'] == 'professor']['Value'].item(), 0.462861, places=6)
        self.assertAlmostEqual(result[result['Category'] == 'other']['Value'].item(), 0.236800, places=6)
        self.assertAlmostEqual(result[result['Category'] == 'task']['Value'].item(), 0.000000, places=6)

    def test_case_5(self):
        '''test for basic output structure'''
        input_data = [('visit', 4), ('brother', -2), ('experience', -10), ('whether', 8), ('hand', 3)]
        result = f_644(input_data)
        self.assertIsInstance(result, pd.DataFrame)
        self.assertIn('Value', result.columns)
        self.assertIn('Category', result.columns)
        self.assertTrue(0 <= result['Value'].min() <= 1)
        self.assertTrue(0 <= result['Value'].max() <= 1)
|
||
def run_tests():
    """Collect every test method of TestCases and run them with a text runner."""
    suite = unittest.TestSuite()
    # unittest.makeSuite was deprecated in Python 3.11 and removed in 3.13;
    # TestLoader.loadTestsFromTestCase is the supported equivalent.
    suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestCases))
    runner = unittest.TextTestRunner()
    runner.run(suite)
|
||
# Run the test suite only when this file is executed as a script.
if __name__ == "__main__":
    run_tests()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
import nltk | ||
from string import punctuation | ||
import pandas as pd | ||
|
||
|
||
def f_645(text):
    """
    Finds all words in a text, that are separated by whitespace,
    beginning with the "$" character and computes their number of occurrences.

    Parameters:
    text (str): The input text.

    Returns:
    DataFrame: A pandas DataFrame with two columns: "Word" and "Frequency".
               "Word" contains the '$' prefixed words, and "Frequency" contains their occurrences.

    Raises:
    ValueError: if text is not a string

    Requirements:
    - nltk
    - string
    - pandas

    Note:
    The function ignores words that are entirely made up of punctuation, even if they start with a '$'.

    Example:
    >>> text = "$abc def $efg $hij klm $ $abc $abc $hij $hij"
    >>> f_645(text)
       Word  Frequency
    0  $abc          3
    1  $efg          1
    2  $hij          3
    >>> text = "$hello this i$s a $test $test $test"
    >>> f_645(text)
         Word  Frequency
    0  $hello          1
    1   $test          3
    """
    if not isinstance(text, str):
        raise ValueError("The input should be a string.")

    # Build the lookup set once instead of once per character of every word.
    punctuation_chars = set(punctuation)

    tk = nltk.WhitespaceTokenizer()
    words = tk.tokenize(text)
    # Keep '$'-prefixed tokens, dropping those made up only of punctuation (e.g. "$", "$$", "$!").
    dollar_words = [
        word for word in words
        if word.startswith('$') and not all(c in punctuation_chars for c in word)
    ]
    freq = nltk.FreqDist(dollar_words)
    df = pd.DataFrame(list(freq.items()), columns=["Word", "Frequency"])
    return df
|
||
import unittest | ||
|
||
|
||
class TestCases(unittest.TestCase):
    """Unit tests for f_645: '$'-word counting, punctuation filtering and input validation."""

    def test_case_1(self):
        text = "$abc def $efg $hij klm $ $abc $abc $hij $hij"
        result = f_645(text)
        expected_words = ["$abc", "$efg", "$hij"]
        expected_freqs = [3, 1, 3]
        self.assertListEqual(result["Word"].tolist(), expected_words)
        self.assertListEqual(result["Frequency"].tolist(), expected_freqs)

    def test_case_2(self):
        text = "This is a test without dollar words."
        result = f_645(text)
        self.assertEqual(len(result), 0)

    def test_case_3(self):
        text = "$test1 $test2 $test1 $test3"
        result = f_645(text)
        expected_words = ["$test1", "$test2", "$test3"]
        expected_freqs = [2, 1, 1]
        self.assertListEqual(result["Word"].tolist(), expected_words)
        self.assertListEqual(result["Frequency"].tolist(), expected_freqs)

    def test_case_4(self):
        text = "$! $$ $a $a $a"
        result = f_645(text)
        expected_words = ["$a"]
        expected_freqs = [3]
        self.assertListEqual(result["Word"].tolist(), expected_words)
        self.assertListEqual(result["Frequency"].tolist(), expected_freqs)

    def test_case_5(self):
        text = "$word1 word2 $word2 $word1 $word3 $word1"
        result = f_645(text)
        expected_words = ["$word1", "$word2", "$word3"]
        expected_freqs = [3, 1, 1]
        self.assertListEqual(result["Word"].tolist(), expected_words)
        self.assertListEqual(result["Frequency"].tolist(), expected_freqs)

    def test_case_6(self):
        '''empty input string'''
        text = ""
        result = f_645(text)
        expected_words = []
        expected_freqs = []
        self.assertListEqual(result["Word"].tolist(), expected_words)
        self.assertListEqual(result["Frequency"].tolist(), expected_freqs)

    def test_case_7(self):
        '''check for correct return type'''
        text = "$test 123 abcd.aef"
        result = f_645(text)
        self.assertIsInstance(result, pd.DataFrame)
        self.assertIn('Word', result.columns)
        self.assertIn('Frequency', result.columns)

    def test_case_8(self):
        '''word with $ in the middle'''
        text = "asdfj;alskdfj;$kjhkjhdf"
        result = f_645(text)
        expected_words = []
        expected_freqs = []
        self.assertListEqual(result["Word"].tolist(), expected_words)
        self.assertListEqual(result["Frequency"].tolist(), expected_freqs)

    def test_case_9(self):
        '''non string input'''
        # Named so it does not shadow the builtin input().
        non_string_input = 24
        # Assert the documented ValueError rather than any Exception.
        self.assertRaises(ValueError, f_645, non_string_input)
|
||
|
||
|
||
|
||
def run_tests():
    """Collect every test method of TestCases and run them with a text runner."""
    suite = unittest.TestSuite()
    # unittest.makeSuite was deprecated in Python 3.11 and removed in 3.13;
    # TestLoader.loadTestsFromTestCase is the supported equivalent.
    suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestCases))
    runner = unittest.TextTestRunner()
    runner.run(suite)
|
||
|
||
# Run the test suite only when this file is executed as a script.
if __name__ == "__main__":
    run_tests()
Oops, something went wrong.