From 1023bb239639c1e34c158bca678f387524479b54 Mon Sep 17 00:00:00 2001 From: Becca McHenry Date: Fri, 20 May 2022 14:59:34 -0500 Subject: [PATCH] add source options --- machine-learning/DataSources.ipynb | 253 ++++++++++++++++++++++++ machine-learning/data/playerSalary.json | 18 ++ 2 files changed, 271 insertions(+) create mode 100644 machine-learning/DataSources.ipynb create mode 100644 machine-learning/data/playerSalary.json diff --git a/machine-learning/DataSources.ipynb b/machine-learning/DataSources.ipynb new file mode 100644 index 0000000..7dd0a8b --- /dev/null +++ b/machine-learning/DataSources.ipynb @@ -0,0 +1,253 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Loading data \n", + "\n", + "Getting data into the DataFrame is the most important step. The DataFrame itself supports [loading from a csv](https://docs.microsoft.com/en-us/dotnet/api/microsoft.data.analysis.dataframe.loadcsvfromstring?view=ml-dotnet-preview#microsoft-data-analysis-dataframe-loadcsvfromstring(system-string-system-char-system-boolean-system-string()-system-type()-system-int64-system-int32-system-boolean)). Not all data is already in a csv file. There is the option to convert from an IDataView into a DataFrame. ML.NET supports loading from a few different sources into an IDataView. See docs [here](https://docs.microsoft.com/en-us/dotnet/machine-learning/how-to-guides/load-data-ml-net). \n", + "\n", + "If you run into issues, please file them in our [GitHub repo](https://github.com/dotnet/machinelearning/issues). If possible, please include the problem data set. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "dotnet_interactive": { + "language": "csharp" + }, + "vscode": { + "languageId": "dotnet-interactive.csharp" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
Installed Packages
  • DataView.InteractiveExtension, 1.0.45
  • Microsoft.Data.Analysis, 0.19.1
  • Microsoft.ML, 1.7.1
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "Loading extensions from `DataView.InteractiveExtension.dll`" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "Loading extensions from `Microsoft.Data.Analysis.Interactive.dll`" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "Added support IDataView to kernel .NET." + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "// load extension to get data frame api, visualization and formatting\n", + "\n", + "#r \"nuget: Microsoft.Data.Analysis, 0.19.1\"\n", + "#r \"nuget: DataView.InteractiveExtension, 1.0.45\"\n", + "#r \"nuget: Microsoft.ML\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Directly from CSV\n", + "We can easily load our data directly from a CSV into the DataFrame. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "dotnet_interactive": { + "language": "csharp" + }, + "vscode": { + "languageId": "dotnet-interactive.csharp" + } + }, + "outputs": [], + "source": [ + "var csvFilePath = @\"data/usa_hockey.csv\";" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "dotnet_interactive": { + "language": "csharp" + }, + "vscode": { + "languageId": "dotnet-interactive.csharp" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
indexBirthdayNatHeightWeightDraftYearOverallDraftHandLast NameFirst NamePositionTeamGamesPlayedGoalsAssistsPointsPIMShiftsTimeOnIce
0
88-16-04USA
72
218
2006
7
ROkposoKyleRWBUF
65
19
26
45
24
1443
73983
1
90-08-10USA
76
210
2009
114
LHelgesonSethDN.J
9
1
0
1
15
177
7273
2
96-26-11USA
77
203
2015
37
RCarloBrandonDBOS
82
6
10
16
59
2080
102414
3
90-16-11USA
74
219
<null>
<null>
LSchallerTimCBOS
59
7
7
14
23
1035
43436
4
92-20-03USA
72
215
2010
37
RFaulkJustinDCAR
75
17
20
37
32
1987
104133
5
94-01-05USA
74
205
2012
120
LSlavinJaccobDCAR
82
5
29
34
12
2135
115316
6
90-20-06USA
75
221
2008
128
RPaterynGregDDAL/MTL
36
1
8
9
10
720
33312
7
90-27-05USA
74
196
2009
198
RDowdNicCL.A
70
6
16
22
25
1230
52314
8
90-16-07USA
75
221
<null>
<null>
LLashoffBrianDDET
5
0
0
0
0
93
3754
9
86-09-08USA
71
197
<null>
<null>
RCannonePatrickCMIN
3
0
0
0
0
35
1419
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "using Microsoft.Data.Analysis;\n", + "\n", + "var csvDataFrame = DataFrame.LoadCsv(csvFilePath);\n", + "csvDataFrame" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ML.NET IDataView Loader\n", + "You may want to load from a different data source. ML.NET supports many different data sources, and we can convert an IDataView into a DataFrame. Find out more about IDataViews [here](https://github.com/dotnet/machinelearning/blob/main/docs/code/IDataViewDesignPrinciples.md). " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "dotnet_interactive": { + "language": "csharp" + }, + "vscode": { + "languageId": "dotnet-interactive.csharp" + } + }, + "outputs": [], + "source": [ + "using Microsoft.ML.Data; \n", + "\n", + "public class SalaryData\n", + "{\n", + " [LoadColumn(0)]\n", + " public float Salary { get; set; }\n", + "\n", + " [LoadColumn(1)]\n", + " public string Name { get; set; }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### From File" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "dotnet_interactive": { + "language": "csharp" + }, + "vscode": { + "languageId": "dotnet-interactive.csharp" + } + }, + "outputs": [], + "source": [ + "using Microsoft.ML;\n", + "using Microsoft.ML.Data;\n", + "using System;\n", + "using System.Collections.Generic;\n", + "using System.Linq;\n", + "\n", + "//Create MLContext\n", + "MLContext mlContext = new MLContext();\n", + "\n", + "//Load Data\n", + "IDataView data = mlContext.Data.LoadFromTextFile<SalaryData>(\"data/playerSalary.csv\", separatorChar: ',', hasHeader: true);\n", + "var df = data.ToDataFrame();\n", + "\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### From JSON" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "dotnet_interactive": { +
"language": "csharp" + }, + "vscode": { + "languageId": "dotnet-interactive.csharp" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
indexSalaryName
0
3000000
Adam Larsson
1
1600000
Andrej Sustr
2
2200000
Antoine Roussel
3
950000
Anton Rodin
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "using Newtonsoft.Json;\n", + "using System.IO;\n", + "\n", + "// Load the json file into an enumerable, then into the data view from the enumerable. \n", + "var accounts = JsonConvert.DeserializeObject<List<SalaryData>>(File.ReadAllText(@\"data\\playerSalary.json\"));\n", + "IDataView dataView = mlContext.Data.LoadFromEnumerable(accounts);\n", + "\n", + "// Convert to DataFrame\n", + "var jsonDataFrame = dataView.ToDataFrame(); \n", + "\n", + "jsonDataFrame" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".NET (C#)", + "language": "C#", + "name": ".net-csharp" + }, + "language_info": { + "file_extension": ".cs", + "mimetype": "text/x-csharp", + "name": "C#", + "pygments_lexer": "csharp", + "version": "8.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/machine-learning/data/playerSalary.json b/machine-learning/data/playerSalary.json new file mode 100644 index 0000000..876f978 --- /dev/null +++ b/machine-learning/data/playerSalary.json @@ -0,0 +1,18 @@ +[ +{ + "Salary": 3000000, + "Name": "Adam Larsson" +}, +{ + "Salary": 1600000, + "Name": "Andrej Sustr" +}, +{ + "Salary": 2200000, + "Name": "Antoine Roussel" +}, +{ + "Salary": 950000, + "Name": "Anton Rodin" +} +] \ No newline at end of file