diff --git a/sciencestuff.sty b/sciencestuff.sty
index a2d8554..2e944fa 100644
--- a/sciencestuff.sty
+++ b/sciencestuff.sty
@@ -16,7 +16,7 @@
 \RequirePackage{dsfont} %---------------------- improved math set symbols
 \RequirePackage{upgreek} %--------------------- better Greek alphabet
 \RequirePackage{physics} %--------------------- full physics-related commands
-\RequirePackage{siunitx} %--------------------- SI units formatting
+\RequirePackage[binary-units=true]{siunitx} %-- SI units formatting
 \RequirePackage{graphicx} %-------------------- images and figures
 \RequirePackage{ifthen} %---------------------- conditionals
 \RequirePackage[sort&compress,
@@ -61,6 +61,8 @@
 \providecommand{\cy}{\textsc{CY}\xspace}
 \providecommand{\lhs}{\textsc{lhs}\xspace}
 \providecommand{\rhs}{\textsc{rhs}\xspace}
+\providecommand{\mse}{\textsc{mse}\xspace}
+\providecommand{\mae}{\textsc{mae}\xspace}
 \providecommand{\ap}{\ensuremath{\alpha'}\xspace}
 \providecommand{\sgn}{\ensuremath{\mathrm{sign}}}
diff --git a/sec/part3/introduction.tex b/sec/part3/introduction.tex
index 96d90c6..0310e88 100644
--- a/sec/part3/introduction.tex
+++ b/sec/part3/introduction.tex
@@ -92,31 +92,32 @@ The simplest CYs are constructed by considering the complete intersection of hyp
 Such hypersurfaces are defined by homogeneous polynomial equations: a Calabi--Yau $X$ is described by the solution to the system of equations, i.e.\ by the intersection of all these surfaces.
 The intersection is ``complete'' in the sense that the hypersurface is non-degenerate.
-%%% TODO %%%
-
 To gain some intuition, consider the case of a single projective space $\mathds{P}^n$ with (homogeneous) coordinates $Z^I$, $I = 0, \ldots, n$.
-In this case, a codimension $1$ subspace is obtained by imposing a single homogeneous polynomial equation of degree $a$ on the coordinates
+A codimension $1$ subspace is obtained by imposing a single homogeneous polynomial equation of degree $a$ on the coordinates:
 \begin{equation}
-    \begin{gathered}
-        p_a(Z^0, \ldots, Z^n)
-        = P_{I_1 \cdots I_a} Z^{I_1} \cdots Z^{I_a}
-        = 0,
+    \begin{split}
+        p_a\qty(Z^0,\, \dots,\, Z^n)
+        & =
+        P_{I_1 \dots I_a}\, Z^{I_1} \dots Z^{I_a}
+        = 0,
         \\
-        p_a(\lambda Z^0, \ldots, \lambda Z^n) = \lambda^a \, p_a(Z^0, \ldots, Z^n).
-    \end{gathered}
+        p_a\qty(\lambda Z^0,\, \dots,\, \lambda Z^n)
+        & =
+        \lambda^a \, p_a\qty(Z^0,\, \dots,\, Z^n).
+    \end{split}
 \end{equation}
-Each choice of the polynomial coefficients $P_{I_1 \cdots I_a}$ leads to a different manifold.
-However, it can be shown that the manifolds are (generically) topologically equivalent.
-Since we are interested only in classifying the CY as topological manifolds and not as complex manifolds, the information about $P_{I_1 \cdots I_a}$ can be forgotten and it is sufficient to keep track only on the dimension $n$ of the projective space and of the degree $a$ of the equation.
-The resulting hypersurface is denoted equivalently as $[\mathds{P}^n \mid a] = [n \mid a]$.
-Finally, $[\mathds{P}^n \mid a]$ is $3$-dimensional if $n = 4$ (the equation reduces the dimension by one), and it is a CY (the “quintic”) if $a = n + 1 = 5$ (this is required for the vanishing of its first Chern class).
-The simplest representative of this class if Fermat's quintic defined by the equation
+Each choice of the polynomial coefficients $P_{I_1 \dots I_a}$ leads to a different manifold.
+However it can be shown that the manifolds are in general topologically equivalent.
+Since we are interested only in classifying the \cy as topological manifolds and not as complex manifolds, the information on $P_{I_1 \dots I_a}$ can be discarded and it is sufficient to keep track only of the dimension $n$ of the projective space and of the degree $a$ of the equation.
+The resulting hypersurface is denoted equivalently as $\qty[\mathds{P}^n \mid a] = \qty[n \mid a]$.
+Notice that $\qty[\mathds{P}^n \mid a]$ is $3$-dimensional if $n = 4$ (the equation reduces the dimension by one), and it is a \cy (the ``quintic'') if $a = n + 1 = 5$ (this is required for the vanishing of its first Chern class).
+The simplest representative of this class is Fermat's quintic, defined by the equation:
 \begin{equation}
     \finitesum{I}{0}{4} \qty( Z^I )^5 = 0.
 \end{equation}
-This construction can be generalized to include $m$ projective spaces and $k$ equations, which can mix the coordinates of the different spaces.
-A CICY $3$-fold $X$ as a topological manifold is completely specified by a \emph{configuration matrix} denoted by the same symbol as the manifold:
+This construction can be generalized to include $m$ projective spaces and $k$ equations which can mix the coordinates of the different spaces.
+A \cicy $3$-fold $X$ as a topological manifold is completely specified by a \emph{configuration matrix} denoted by the same symbol as the manifold:
 \begin{equation}
     X =
     \left[
@@ -138,74 +139,56 @@ where the coefficients $a^r_{\alpha}$ are positive integers and satisfy the follo
     \forall r \in \qty{1,\, 2,\, \dots,\, m}.
     \label{eq:cicy-constraints}
 \end{equation}
-The first relation states that the dimension of the ambient space minus the number of equations equals the dimension of the CY $3$-fold.
-The second set of constraints arise from the vanishing of its first Chern class; they imply that the $n_i$ can be recovered from the matrix elements.
-
-In this case also, two manifolds described by the same configuration matrix but different polynomials are equivalent as real manifold (they are diffeomorphic) -- and thus as topological manifolds --, but they are different as complex manifolds.
-Hence, it makes sense to write only the configuration matrix.
+The first relation states that the difference between the dimension of the ambient space and the number of equations is the dimension of the \cy $3$-fold.
+The second set of constraints arises from the vanishing of its first Chern class.
+It implies that the $n_i$ can be recovered from the matrix elements.
+Two manifolds described by the same configuration matrix but different polynomials are diffeomorphic as real manifolds, and thus as topological manifolds, but they are different as complex manifolds.
+Hence it makes sense to write only the configuration matrix.
 A given topological manifold is not described by a unique configuration matrix.
-First, any permutation of the lines and columns leave the intersection unchanged (it amounts to relabelling the projective spaces and equations).
+First, any permutation of the lines and columns leaves the intersection unchanged as it amounts to relabelling the projective spaces and equations.
 Secondly, two intersections can define the same manifold.
 The ambiguity in the line and column permutations is often fixed by imposing some ordering of the coefficients.
-Moreover, in most cases, there is an optimal representation of the manifold $X$, called favourable~\cite{Anderson:2017:FibrationsCICYThreefolds}: in such a form, topological properties of $X$ can be more easily derived from the ambient space $\cA$.
+Moreover there is an optimal representation of the manifold $X$, called \emph{favourable}~\cite{Anderson:2017:FibrationsCICYThreefolds}: in such a form topological properties of $X$ can be more conveniently derived from the ambient space $\cA$.
+Finally, simple arguments~\cite{Green:1987:CalabiYauManifoldsComplete, Candelas:1988:CompleteIntersectionCalabiYau, Lutken:1988:RecentProgressCalabiYauology} show that the number of \cicy manifolds is necessarily finite due to the constraints~\eqref{eq:cicy-constraints} together with identities between complete intersection manifolds.
 
 \subsection{Datasets}
 \label{sec:data:datasets}
-
-Simple arguments~\cite{Green:1987:CalabiYauManifoldsComplete, Candelas:1988:CompleteIntersectionCalabiYau, Lutken:1988:RecentProgressCalabiYauology} show that the number of CICY is necessarily finite due to the constraints \eqref{eq:cicy-constraints} together with identities between complete intersection manifolds.
-The classification of the CICY $3$-folds has been tackled in~\cite{Candelas:1988:CompleteIntersectionCalabiYau}, which established a dataset of $7890$ CICY.\footnotemark{}
-\footnotetext{%
-    However, there are redundancies in this set~\cite{Candelas:1988:CompleteIntersectionCalabiYau, Anderson:2008:MonadBundlesHeterotic, Anderson:2017:FibrationsCICYThreefolds}; this fact will be ignored in this paper.
-}%
+The classification of the \cicy $3$-folds has been tackled in~\cite{Candelas:1988:CompleteIntersectionCalabiYau}.
+The analysis established a dataset of $7890$ \cicy.
 The topological properties of each of these manifolds have been computed in~\cite{Green:1989:AllHodgeNumbers}.
-More recently, a new classification has been performed~\cite{Anderson:2017:FibrationsCICYThreefolds} in order to find the favourable representation of each manifold whenever it is possible.
+More recently a new classification has been performed~\cite{Anderson:2017:FibrationsCICYThreefolds} in order to find the favourable representation of each manifold whenever it is possible.
-Below we show a list of the CICY properties and of their configuration matrices: +Below we show a list of the \cicy properties and of their configuration matrices: \begin{itemize} - \item general properties + \item general properties: \begin{itemize} \item number of configurations: $7890$ - \item number of product spaces (block diagonal matrix): $22$ - - \item $h^{11} \in [0, 19]$, $18$ distinct values (\Cref{fig:data:hist-h11}) - - \item $h^{21} \in [0, 101]$, $65$ distinct values (\Cref{fig:data:hist-h21}) - - + \item $h^{11} \in [0, 19]$ with $18$ distinct values (\Cref{fig:data:hist-h11}) + \item $h^{21} \in [0, 101]$ with $65$ distinct values (\Cref{fig:data:hist-h21}) \item unique Hodge number combinations: $266$ \end{itemize} - \item “original dataset”~\cite{Candelas:1988:CompleteIntersectionCalabiYau, Green:1989:AllHodgeNumbers} - + \item ``original dataset''~\cite{Candelas:1988:CompleteIntersectionCalabiYau, Green:1989:AllHodgeNumbers} \begin{itemize} \item maximal size of the configuration matrices: $12 \times 15$ - \item number of favourable matrices (excluding product spaces): $4874$ ($\num{61.8}\%$) - \item number of non-favourable matrices (excluding product spaces): $2994$ - \item number of different ambient spaces: $235$ \end{itemize} - - \item “favourable dataset”~\cite{Anderson:2017:FibrationsCICYThreefolds} - + \item ``favourable dataset''~\cite{Anderson:2017:FibrationsCICYThreefolds} \begin{itemize} \item maximal size of the configuration matrices: $15 \times 18$ - \item number of favourable matrices (excluding product spaces): $7820$ ($\num{99.1}\%$) - \item number of non-favourable matrices (excluding product spaces): $48$ - \item number of different ambient spaces: $126$ \end{itemize} \end{itemize} - \begin{figure}[tbp] \centering \begin{subfigure}[c]{.45\linewidth} @@ -225,18 +208,12 @@ Below we show a list of the CICY properties and of their configuration matrices: \label{fig:data:hist-hodge} \end{figure} - -The configuration matrix completely encodes the information of the CICY and all topological quantities can be derived from it. -However, the computations are involved and there is often no closed-form expression. -This situation is typical in algebraic geometry, and it can be even worse for some problems, in the sense that it is not even known how to compute the desired quantity (think to the metric of CYs). -For these reasons, it is interesting to study how we can retrieve these properties using ML algorithms. -In the current paper, following~\cite{He:2017:MachinelearningStringLandscape, Bull:2018:MachineLearningCICY}, we focus on the computation of the Hodge numbers with the initial scheme: -\begin{equation} - \text{Input: configuration matrix} - \quad \longrightarrow \quad - \text{Output: Hodge numbers} -\end{equation} -To provide a good test case for the use of ML in context where the mathematical theory is not completely understood, we will make no use of known formulas. +The configuration matrix completely encodes the information of the \cicy and all topological quantities can be derived from it. +However the computations are involved and there is often no closed-form expression. +This situation is typical in algebraic geometry and it can be even worse for some problems, in the sense that it is not even known how to compute the desired quantity (e.g. the metric of \cy manifolds). +For these reasons it is interesting to study how to retrieve these properties using \ml algorithms. +In what follows we focus on the prediction of the Hodge numbers. 
+To provide a good test case for the use of \ml in contexts where the mathematical theory is not completely understood, we make no use of known formulas.
 
 % vim: ft=tex
diff --git a/sec/part3/ml.tex b/sec/part3/ml.tex
new file mode 100644
index 0000000..db6f5dd
--- /dev/null
+++ b/sec/part3/ml.tex
@@ -0,0 +1,1461 @@
+In the following sections we present the preliminary analysis and the machine and deep learning study applied to the prediction of the Hodge numbers of \cicy $3$-folds.
+We use both a ``classical'' approach to \ml, based on \emph{shallow learning} algorithms and on geometrical methods to find new representations of the data, and more modern approaches based on \emph{computer vision} and recent developments in computer science.
+We show how \emph{deep learning} the geometry of string theory can help in providing good computational tools for phenomenology and algebraic geometry.
+We also outline future investigations which can be performed based on these results.
+
+
+\subsection{Exploratory Data Analysis}
+\label{sec:data:eda}
+
+A typical \ml project does not consist of feeding the raw data to the algorithm.
+It is instead preceded by a phase of exploration in order to better understand the data, which in turn can help to design the learning algorithms.
+We call \emph{features} the properties given as inputs, and \emph{labels} the targets of the predictions.
+There are several phases in the exploratory data analysis (\eda)~\cite{Skiena:2017:DataScienceDesign}:
+\begin{enumerate}
+    \item \emph{feature engineering}: new features are derived from the inputs;
+
+    \item \emph{feature selection}: the most relevant features are chosen to explain the targets;
+
+    \item \emph{data augmentation}: new training data is generated from the existing ones;
+
+    \item \emph{data diminution}: part of the training data is not used.
+\end{enumerate}
+
+Engineered features are redundant by definition but they can help the algorithm learn more efficiently by providing an alternative formulation and by drawing attention to salient characteristics.
+A simple example is the following: given a series of numbers, one can compute different statistics, such as median, mean and variance, and add them to the inputs.
+It may happen that the initial series then becomes irrelevant once this new information is introduced.
+Another approach to improve the learning process is to augment or decrease the number of training samples artificially.\footnotemark{}
+\footnotetext{%
+    This is in general used in computer vision and object detection tasks where providing rotated, scaled and cropped versions of the input images can help the algorithms learn more representations of the same object, thus leading to more accurate predictions.
+}
+For example we could use invariances of the inputs to generate more training data.
+This however does not help in our case because the entries of the configuration matrices are partially ordered.
+Another possibility is to remove outliers which can damage the learning process by driving the algorithm far from the best solution.
+If there are only a few of them it is better to ignore them altogether during training since an algorithm which is not robust to outliers will in any case make bad predictions (a standard illustration is given by the Pearson and Spearman correlation coefficients, with the first not being robust to outliers~\cite{Skiena:2017:DataScienceDesign}).
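+The latter point can be illustrated with a short numerical experiment: a single extreme value is enough to spoil the Pearson coefficient, while the rank-based Spearman coefficient barely moves.
+The snippet below is only an indicative sketch on synthetic data and is not part of the analysis itself:
+\begin{verbatim}
+import numpy as np
+from scipy.stats import pearsonr, spearmanr
+
+rng = np.random.default_rng(0)
+x = np.arange(100.0)
+y = x + rng.normal(scale=5.0, size=100)       # almost linear relation
+
+print(pearsonr(x, y)[0], spearmanr(x, y)[0])  # both close to 1
+
+y[10] = 1e4                                   # a single extreme outlier
+print(pearsonr(x, y)[0], spearmanr(x, y)[0])  # Pearson collapses,
+                                              # Spearman stays close to 1
+\end{verbatim}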
+ +Before starting the \eda, the first step should be to split the data into training and validation sets to avoid biasing the choices of the algorithm and the strategy: the \eda should be performed only on the training set. +However the dataset we consider is complete and quite uniform: a subset of it would display the same characteristics as the entire set.\footnotemark{} +\footnotetext{% + A dataset is \emph{tidy} if every column represents a separate variable and every row is a different observation. + For instance every row could represent a date expressed in seconds from a reference instant and every column could be a separate sensor reading. + However the ``transposed'' version of the dataset is not a tidy dataset because the observations are in the columns, thus representing a non standard form of the data. + A tidy dataset is \emph{complete} if there are no empty cells, that is there is no lack of data or information in the entire set. + A \emph{uniform} dataset can be understood as a complete dataset in which every variable is well distributed and does not present a lot of outliers or anomalies. +} +To give a general overview of the properties we work with the full dataset. + + +\subsubsection{Engineering} + +\begin{figure}[tbp] + \centering + \begin{subfigure}[c]{.45\linewidth} + \centering + \includegraphics[width=\linewidth, trim={6in 0 0 0}, clip]{img/svd_orig} + \caption{original dataset} + \end{subfigure} + \hfill + \begin{subfigure}[c]{.45\linewidth} + \centering + \includegraphics[width=\linewidth, trim={6in 0 0 0}, clip]{img/svd_fav} + \caption{favourable dataset} + \end{subfigure} + \caption{% + Cumulative retained variance of the principal components of the configuration matrix. + } + \label{fig:eda:svd} +\end{figure} + +Any transformation of the input data which has some mathematical meaning can be a useful feature. +We establish the following list of useful quantities (most of them are already used to characterise \cicy manifolds in the literature~\cite{Hubsch:1992:CalabiyauManifoldsBestiary}): +\begin{itemize} + \item the number of projective spaces (rows), $m = $ \texttt{num\_cp}; + + \item the number of equations (columns), $k = $ \texttt{num\_eqs}; + + \item the number of $\mathds{P}^1$, $f = $ \texttt{num\_cp\_1}; + + \item the number of $\mathds{P}^2$, \texttt{num\_cp\_2}; + + \item the number of $\mathds{P}^n$ with $n \neq 1$, $F = $ \texttt{num\_cp\_neq1}; + + \item the excess number $N_{ex} = \finitesum{r}{1}{F} \qty(n_r + f + m - 2k) =$ \texttt{num\_ex}; + + \item the dimension of the cohomology group $H^0$ of the ambient space, \texttt{dim\_h0\_amb}; + + \item the Frobenius norm of the matrix, \texttt{norm\_matrix}; + + \item the list of the projective space dimensions \texttt{dim\_cp} and statistics thereof (min, max, median, mean); + + \item the list of the equation degrees \texttt{deg\_eqs} and statistics thereof (min, max, median, mean); + + \item $k$-means clustering on the components of the configuration matrix (with a number of clusters going from 2 to 15);\footnotemark{} + \footnotetext{% + The algorithm determines the centroids of conglomerates of data called \textit{clusters} using an iterative process which computes the distance of each sample from the center of the cluster. + It then assigns the label of the cluster to the nearest samples. + We use the class \texttt{cluster.KMeans} in \texttt{scikit-learn}. 
+ }% + + \item principal components of the configuration matrix derived using a principal components analysis (\pca) with \SI{99}{\percent} of the variance retained (see~\Cref{fig:eda:svd}). +\end{itemize} + + +\subsubsection{Selection} + + +\paragraph{Correlations} + +To get a first general idea it is useful to take a look at the correlation matrix of the features and the labels.\footnotemark{} +\footnotetext{% + The correlation is defined as the ratio between the covariance of two variables $\sigma(x, y) = \sum_{i} (x_i - \bar{x})(y_i - \bar{y})$ and the product of the standard deviations $\sigma(x)\sigma(y)$ (in this case $\bar{x}$ and $\bar{y}$ are the sample means). +} +The correlation matrices for the scalar variables are displayed in~\Cref{fig:eda:corr} for the original and favourable datasets (this excludes the configuration matrix). +As we can see some engineered features are strongly correlated, especially in the favourable dataset. +In particular \hodge{1}{1} (respectively \hodge{2}{1}) is strongly correlated (respectively anti-correlated) with the number of projective spaces $m$ and with the norm and rank of the matrix. +This gives a first hint that these variables could help improve predictions by feeding them to the algorithm along with the matrix. +On the other hand finer information on the number of projective spaces and equations do not correlate with the Hodge numbers. + +From this analysis, and in particular from~\Cref{fig:eda:corr}, we find that the values of \hodge{1}{1} and \hodge{2}{1} are also correlated. +This motivates the simultaneous learning of both Hodge numbers since it can increase chances for the neural network to learn more universal features. +In fact this is something that often happens in practice: it has been found that multi-tasking enhances the ability to generalise~\cite{Thrun:1996:LearningNthThing, Caruana:1997:MultitaskLearning, Baxter:2000:ModelInductiveBias, Maurer:2016:BenefitMultitaskRepresentation, Ndirango:2019:GeneralizationMultitaskDeep}. + +\begin{figure}[tbp] + \centering + \begin{subfigure}[c]{.45\linewidth} + \centering + \includegraphics[width=\linewidth]{img/corr-matrix_orig} + \caption{original dataset} + \end{subfigure} + \hfill + \begin{subfigure}[c]{.45\linewidth} + \centering + \includegraphics[width=\linewidth]{img/corr-matrix_fav} + \caption{favourable dataset} + \end{subfigure} + \caption{Correlations between the engineered scalar features and the labels.} + \label{fig:eda:corr} +\end{figure} + + +\paragraph{Feature importance} + +A second non-exclusive option is to sort the features by order of importance. +This can be done using a decision tree which is capable to determine the weight of each variable towards making a prediction. +One advantage over correlations is that the algorithm is non-linear and can thus determine subtler relations between the features and labels. +To avoid biasing the results using only one decision tree, we trained a random forest of trees (using \texttt{ensemble.RandomForestRegressor} in \texttt{scikit-learn}). +It consists in a large number of decision trees which are trained on different random subsets of the training dataset and averaged over the outputs (see~\Cref{sec:app:trees} for details on the implementation). +The algorithm determines the importance of the different features to make predictions as a by-product of the learning process: the most relevant features tend to be found at the first branches (or to be used the most) since they are the most important to make the prediction. 
+The importance of a variable is a number between $0$ and $1$, and the sum over all of them must be $1$. +Since a random forest contains many trees the robustness of the variable ranking usually improves with respect to a single tree (\Cref{sec:app:trees}). +Moreover, as the main objective is to obtain a qualitative preliminary understanding of the features, there is no need for fine tuning at this stage and we use the default parameters (specifically \num{100} decision trees). +We computed feature importance for both datasets and for two different set of variables: one containing the engineered features and the configuration matrix, and one with the engineered features and the \pca components. +In the following figures, we show several comparisons of the importance of the features, dividing the figures into scalars, vectors and configuration matrix (or its \pca), and clusters. +The sum of importance of all features equals $1$. + +In~\Cref{fig:eda:scalars}, we show the ranking of the scalar features in the two datasets (differences between the set using the configuration matrix and the other using the \pca are marginal and are not shown to avoid redundant plots). +As already mentioned we find that the number of projective spaces is the most important feature by far. +It is followed by the matrix norm in the original dataset, and by the matrix rank for \hodge{2}{1} in the favourable dataset. +Finally the variable ranking points out that the other features have a negligible impact on the determination of the labels and may as well be ignored during training. + +\begin{figure}[tbp] + \centering + \begin{subfigure}[c]{0.45\linewidth} + \centering + \includegraphics[width=\linewidth, trim={0 0 6in 0}, clip]{img/scalar-features_orig} + \caption{original dataset} + \end{subfigure} + \hfill + \begin{subfigure}[c]{0.45\linewidth} + \centering + \includegraphics[width=\linewidth, trim={0 0 6in 0}, clip]{img/scalar-features_fav} + \caption{favourable dataset} + \end{subfigure} + \caption{% + Importance of the scalar features in the datasets. + } + \label{fig:eda:scalars} +\end{figure} + +The same analysis can be repeated for the vector features and the configuration matrix component by component. +In~\Cref{fig:eda:tensor} we show the cumulative importance of the features (i.e.\ the sum of the importance of each component). +We can appreciate that the list of the projective space dimensions plays a major role in the determination of the labels in both datasets. +In the case of \hodge{2}{1} we also have a large contribution from the dimensions of the cohomology group \texttt{dim\_h0\_amb}, as can be expected from algebraic topology~\cite{Hubsch:1992:CalabiyauManifoldsBestiary}. + +\begin{figure}[tbp] + \centering + \begin{subfigure}[c]{\linewidth} + \centering + \includegraphics[width=\linewidth]{img/vector-tensor-features_orig} + \caption{Original dataset} + \end{subfigure} + \hfill + \begin{subfigure}[c]{\linewidth} + \centering + \includegraphics[width=\linewidth]{img/vector-tensor-features_fav} + \caption{Favourable dataset} + \end{subfigure} + \caption{% + Ranking of the vector features and the configuration matrix (or its \pca). + } + \label{fig:eda:tensor} +\end{figure} + +In~\Cref{fig:eda:cluster} we show the importance associated to the number of clusters used during the \eda: no matter how many clusters we use, their relevance is definitely marginal compared to all other features used in the variable ranking (scalars, vectors, and the configuration matrix or its \pca) for both datasets. 
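+For reference, the \pca components entering the previous rankings can be obtained with \texttt{decomposition.PCA} in \texttt{scikit-learn}.
+The following is only a sketch and assumes that the (zero-padded) configuration matrices have already been flattened into the rows of an array \texttt{M}:
+\begin{verbatim}
+from sklearn.decomposition import PCA
+
+# keep the smallest number of components explaining 99% of the variance
+pca = PCA(n_components=0.99, svd_solver="full")
+M_pca = pca.fit_transform(M)
+print(M_pca.shape, pca.explained_variance_ratio_.sum())
+\end{verbatim}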
+
+\begin{figure}[tbp]
+    \centering
+    \begin{subfigure}[c]{0.45\linewidth}
+        \centering
+        \includegraphics[width=\linewidth, trim={0 0 6in 0}, clip]{img/cluster-features_orig}
+        \caption{Original dataset}
+    \end{subfigure}
+    \hfill
+    \begin{subfigure}[c]{0.45\linewidth}
+        \centering
+        \includegraphics[width=\linewidth, trim={0 0 6in 0}, clip]{img/cluster-features_fav}
+        \caption{Favourable dataset}
+    \end{subfigure}
+    \caption{%
+        Impact of the number of clusters on the variable ranking.
+    }
+    \label{fig:eda:cluster}
+\end{figure}
+
+\begin{figure}[tbp]
+    \centering
+    \begin{subfigure}[c]{\linewidth}
+        \centering
+        \includegraphics[width=\linewidth, trim={0 10in 0 0}, clip]{img/distr-labels-corr-feat_orig}
+        \caption{Original dataset}
+    \end{subfigure}
+    \begin{subfigure}[c]{\linewidth}
+        \centering
+        \includegraphics[width=\linewidth, trim={0 10in 0 0}, clip]{img/distr-labels-corr-feat_fav}
+        \caption{Favourable dataset}
+    \end{subfigure}
+    \caption{%
+        Distribution of the labels with respect to the number of projective spaces.
+    }
+    \label{fig:eda:distr}
+\end{figure}
+
+
+\paragraph{Discussion}
+
+It seems therefore that the number of projective spaces, as well as the list of the projective space dimensions, plays a relevant role in the determination of \hodge{1}{1} and \hodge{2}{1}.
+In order to validate this observation, in~\Cref{fig:eda:distr} we present a scatter plot of the Hodge number distributions versus the number of projective spaces: it shows that there is indeed a linear dependence in $m$ for \hodge{1}{1}, especially in the favourable dataset.
+In fact the only exceptions to this pattern in the latter case are the manifolds which do not have a favourable embedding~\cite{Anderson:2017:FibrationsCICYThreefolds}.
+Hence, a simple data analysis hints naturally towards this mathematical result.
+
+Finally we found other features which may be relevant and are worth including in the algorithm: the matrix rank and norm, the list of projective space dimensions and of the associated cohomology dimensions.
+However, we want to emphasise one caveat to this analysis: correlations only capture linear relations, and the random forest has not been optimised and may simply not be powerful enough to make good predictions.
+This means that feature selection only gives a hint and it may be necessary to adapt it to different situations.
+
+
+\subsubsection{Removing Outliers}
+\label{sec:data:eda:outliers}
+
+The Hodge number distributions (see \Cref{fig:data:hist-hodge,fig:data:distr}) display a few outliers outside the tail of the main distributions.
+Such outliers may negatively impact the learning process and drive down the accuracy: it makes sense to remove them from the training set.
+It is easy to see that the \num{22} outlying manifolds with $\hodge{1}{1} = \hodge{2}{1} = 0$ are product spaces, recognisable from their block-diagonal matrix.
+We will also remove outliers with $\hodge{1}{1} = 19$ and $\hodge{2}{1} > 86$, which represent $15$ and $2$ samples respectively.
+In total this represents $39$ samples, or \SI{0.49}{\percent} of the total data.
+
+Since the dataset is complete, to simplify the overall presentation we will mainly focus on the pruned subset of the data obtained by removing outliers, even from the test set.\footnotemark{}
+\footnotetext{%
+    There is no obligation to use a \ml algorithm to label outliers in the training set.
+    It is perfectly fine to decide which data to include or not, even based on targets.
+    However, for a real-world application, outliers in the test set should be labelled by some process based only on the input features.
+    Flagging possible outliers may improve the predictions by helping the machine understand that such samples require more caution.
+}
+This implies that Hodge numbers lie in the ranges $1 \le \hodge{1}{1} \le 16$ and $15 \le \hodge{2}{1} \le 86$.
+Except when stated otherwise, accuracy is indicated for this pruned dataset.
+Obviously the very small percentage of outliers makes the effect of removing them from the test set negligible when stating accuracy.
+
+\begin{figure}[tbp]
+    \centering
+    \begin{subfigure}[c]{0.45\linewidth}
+        \centering
+        \includegraphics[width=\linewidth, trim={0 0 6in 0}, clip]{img/label-distribution-compare_orig}
+        \caption{\hodge{1}{1}}
+    \end{subfigure}
+    \qquad
+    \begin{subfigure}[c]{0.45\linewidth}
+        \centering
+        \includegraphics[width=\linewidth, trim={6in 0 0 0}, clip]{img/label-distribution-compare_orig}
+        \caption{\hodge{2}{1}}
+    \end{subfigure}
+
+    \caption{%
+        Summary of the statistics for the distributions of both Hodge numbers.
+        The coloured box shows the three quartiles of the distributions, with the internal horizontal line corresponding to the median.
+        The ``whiskers'' extend to $1.5$ times the interquartile range (the distance between the first and third quartiles) beyond the lower and upper limits of the boxes.
+        Isolated points show the remaining outliers which we however choose to keep to avoid excessively pruning the dataset.
+    }
+    \label{fig:data:distr}
+\end{figure}
+
+
+\subsection{Machine Learning Analysis}
+\label{sec:ml}
+
+We compare the performances of different \ml algorithms: linear regression, support vector machines (\svm), random forests, gradient boosted trees and (deep) neural networks.
+We obtain the best results using deep \emph{convolutional} neural networks.
+In fact we present a new neural network architecture, inspired by the Inception model~\cite{Szegedy:2015:GoingDeeperConvolutions, Szegedy:2016:RethinkingInceptionArchitecture, Szegedy:2016:Inceptionv4InceptionresnetImpact}, which was developed in the field of computer vision.
+We provide some details on the different algorithms in~\Cref{app:ml-algo} and refer the reader to the literature~\cite{Bengio:2017:DeepLearning, Chollet:2018:DeepLearningPython, Geron:2019:HandsOnMachineLearning, Skiena:2017:DataScienceDesign, Mehta:2019:HighbiasLowvarianceIntroduction, Carleo:2019:MachineLearningPhysical, Ruehle:2020:DataScienceApplications} for further information.
+
+
+\subsubsection{Feature Extraction}
+\label{sec:ml:selection}
+
+In~\Cref{sec:data:eda} the \eda showed that several engineered features are promising for predicting the Hodge numbers.
+In what follows we compare the performances of various algorithms using different subsets of features:
+\begin{itemize}
+    \item only the configuration matrix (no feature engineering);
+
+    \item only the number of projective spaces $m$;
+
+    \item only a subset of engineered features, without the configuration matrix or its \pca;
+
+    \item a subset of engineered features and the \pca of the matrix.
+\end{itemize}
+
+Following the \eda and feature engineering, we select the features used in the analysis by choosing the highest ranked ones.
+We keep the number of projective spaces (\texttt{num\_cp} in the dataset) and the list of the dimensions of the projective spaces (\texttt{dim\_cp}) for both \hodge{1}{1} and \hodge{2}{1}.
+We also include the dimension of the cohomology group of the ambient space \texttt{dim\_h0\_amb}, but only for \hodge{2}{1}.\footnotemark{}
+\footnotetext{%
+    Notice that providing different kinds of input features to the algorithm is fine as long as such variables come from the same training set.
+    In other words, it is possible to provide different representations of the same set to different algorithms while retaining the same statistical relevance~\cite{Geron:2019:HandsOnMachineLearning, Skiena:2017:DataScienceDesign}.
+}
+
+
+\subsubsection{Analysis Strategy}
+\label{sec:ml:strategy}
+
+For the \ml analysis, we split the dataset into training and test sets: we fit the algorithms on the former and then show the predictions on the test set, which will not be touched until the algorithms are ready.
+
+
+\paragraph{Test split and validation}
+
+The training set is made of \SI{90}{\percent} of the samples, which leaves the remaining \SI{10}{\percent} in the test set (i.e.\ $785$ manifolds out of the $7851$ in the set).\footnotemark{}
+\footnotetext{%
+    Remember that we have removed outliers, see~\Cref{sec:data:eda:outliers}.
+    The interested reader can refer to~\cite{Erbin:2020:InceptionNeuralNetwork} where outliers are kept in the test set.
+}
+For most algorithms, we evaluate the performance with a $9$-fold cross-validation on the training set, leaving one fold out at a time: we subdivide the training set into $9$ subsets, each of them containing \SI{10}{\percent} of the \emph{total} amount of samples, then we train the algorithm on $8$ of them and evaluate it on the $9$th.
+We then repeat the procedure changing the evaluation fold until the algorithm has been trained and evaluated on all of them.
+The performance measure in validation is given by the average over all the left-out folds.
+When training neural networks, we will however use a single \emph{holdout validation} set made of \SI{10}{\percent} of the \emph{total} samples.
+
+
+\paragraph{Predictions and metrics}
+
+Since we are interested in predicting exactly the Hodge numbers, the appropriate metric measuring the success of the predictions is the accuracy (for each Hodge number separately):
+\begin{equation}
+    \text{accuracy}
+    =
+    \frac{1}{N} \finitesum{i}{1}{N}
+    \delta_{y^{\text{true}\, (i)},\, y^{\text{pred}\, (i)}},
+\end{equation}
+where $N$ is the number of samples.
+In this analysis the accuracy of the predictions on the test set is rounded to the nearest integer.
+
+Since the Hodge numbers are integers the problem of predicting them looks like a classification task.
+However, as argued in the introduction, we prefer to use a regression approach.
+Indeed regression does not require specifying the data boundaries and allows extrapolating beyond them, contrary to a classification approach where the categories are fixed at the beginning.\footnotemark{}
+\footnotetext{%
+    A natural way to transform the problem into a regression task is to \emph{standardise} the Hodge numbers, for example by shifting by the mean value and dividing by the standard deviation.
+    Under this transformation, the Hodge numbers are mapped to real numbers.
+    While standardisation often improves \ml algorithms, we found that the impact was mild or even negative.
+}
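+As a concrete illustration, the accuracy defined above can be computed once the real-valued outputs of a regression model are mapped back to integers.
+The following is only a sketch, not tied to any specific model, and assumes \texttt{numpy} arrays of true and predicted values:
+\begin{verbatim}
+import numpy as np
+
+def rounded_accuracy(y_true, y_pred, rounding=np.rint):
+    # fraction of exact matches after mapping the real-valued
+    # predictions to integers (e.g. np.rint or np.floor)
+    return np.mean(rounding(y_pred).astype(int) == np.asarray(y_true))
+\end{verbatim}
+The same function can also be wrapped with \texttt{metrics.make\_scorer} to serve as the scoring function of the hyperparameter searches described below.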
+
+Most algorithms need a differentiable loss function since the optimisation of parameters (such as neural network weights) uses some variant of the gradient descent method.
+For this reason the accuracy cannot be used and the models are trained by minimisation of the mean squared error (\mse), which is simply the squared $\ell_2$-norm of the difference between the predictions and the real values.
+There will however also be a restricted number of cases in which we will use either the mean absolute error (\mae), which is the $\ell_1$-norm of the same difference, or a weighted linear combination of \mse and \mae (similar in spirit to the \textit{Huber} loss): we will point out these cases explicitly.
+When predicting both Hodge numbers together, the total loss is the sum of each individual loss with equal weight: \hodge{1}{1} is simpler to learn so it is useful to put emphasis on learning \hodge{2}{1}, but the magnitudes of the latter are higher, such that the associated loss is naturally bigger (since we did not normalise the data).
+
+As mentioned above, predictions are real numbers and we need to turn them into integers.
+Rounding to the nearest integer usually gives the best result, but we found algorithms (such as linear regression) for which flooring to the integer below works better.
+The optimal choice of the integer function is found for each algorithm as part of the hyperparameter optimisation described below.
+The accuracy is computed after rounding.
+
+Learning curves for salient models are displayed.
+They show how the performance of a model improves by using more training data, for fixed hyperparameters.
+To obtain them, we train models using from \SI{10}{\percent} to \SI{90}{\percent} of all the data (``training ratio'') and evaluate the accuracy on the remaining data.\footnotemark{}
+\footnotetext{%
+    Statistics are not provided due to the limitations of our available computational resources, namely a \emph{Thinkpad t470p} laptop with \emph{Intel i7-7700HQ} CPU, \SI{16}{\giga\byte} RAM and \emph{NVidia GeForce 940MX} GPU.
+    However, we check manually on a few examples that the reported results are typical.
+}
+
+To avoid redundant information and to avoid cluttering the paper with graphs, the results of the models predicting the Hodge numbers separately are reported in tables for the test set, while the results of the models predicting both numbers together are reported in the learning curves.
+For the same reason, the latter are not displayed for the favourable dataset.
+
+
+\paragraph{Visualisation of the performance}
+
+Complementary to the predictions and the accuracy results, we also provide different visualisations of the performance of the models in the form of univariate plots (histograms) and multivariate distributions (scatter plots).
+In fact the usual assumption behind the statistical inference of a distribution is that the difference between the observed data and the predicted values can be modelled by a random variable called \textit{residual}~\cite{Lista:2017:StatisticalMethodsData,Caffo::DataScienceSpecialization}.\footnotemark{}
+\footnotetext{%
+    The difference between the non-observable \textit{true} value of the model and the observed data is known as \textit{statistical error}.
+    The difference between residuals and errors is subtle but the two definitions have different interpretations in the context of the regression analysis: in a sense, residuals are an estimate of the errors.
+} +As such we expect that its values can be sampled from a normal distribution with a constant variance (i.e.\ constant width), since it should not depend on specific observations, and centered around zero, since the regression algorithm tries to minimise the squared difference between observed and predicted values. +Histograms of the residual errors should therefore exhibit such properties graphically. +Another interesting kind of visual realisation of the residuals is to show their distribution against the variables used for the regression model: in the case of a simple regression model in one variable, it is customary to plot the residuals as a function of the independent variable, but in a multivariable regression analysis (such as the case at hand) the choice usually falls on the values predicted by the fit (not the observed data). +We shall therefore plot the residuals as functions of the predicted values.\footnotemark{} +\footnotetext{ + We will use the same strategy also for the fit using just the number of projective spaces in order to provide a way to compare the plots across different models. +} +Given the assumption of the random distribution of the residuals, they should not present strong correlations with the predictions and should not exhibit trends. +In general the presence of correlated residuals is an indication of an incomplete or incorrect model which cannot explain the variance of the predicted data, meaning that the model is either not suitable for predictions or that we should add information (i.e.\ add features) to it. + + +\paragraph{Hyperparameter optimisation} + +One of the key steps in a \ml analysis is the optimisation of the \emph{hyperparameters} of the algorithm. +These are internal parameters of each estimator (such as the number of trees in a random forest or the amount of regularisation in a linear model): they are not modified during the training of the model, but they directly influence the process in terms of performance and outcome. + +Hyperparameter optimisation is performed by training many models with different choices of their values. +We then keep the values best performing according to some metric on the validation set(s).\footnotemark{} +\footnotetext{% + Notice the importance of having a validation set separate from the test set: we must avoid adapting the algorithm to the same set we use for the predictions or the generalisation capabilities of the algorithm will suffer. +} +As it does not need to be differentiable we use the accuracy as a scoring function to evaluate the models. +There is however a subtle issue because it is not clear how to combine the accuracy of \hodge{1}{1} and \hodge{2}{1} to get a single metric. +For this reason we perform the analysis on both Hodge numbers separately. +Then we can design a single model computing both Hodge numbers simultaneously by making a compromise by hand between the hyperparameters found for the two models computing the Hodge numbers separately. +The optimisation is implemented using the API in \texttt{scikit-learn}, using the function \texttt{metrics.make\_scorer} and the accuracy as a custom scoring function. + +There are several approaches to perform this search automatically, in particular: grid search, random search, genetic evolution, and Bayes optimisation. +Grid and random search are natively implemented in \texttt{scikit-learn}. +The first takes a list of possible discrete values of the hyperparameters and will evaluate the algorithm over all possible combinations. 
+The second samples the values in both discrete sets and continuous intervals according to some probability distribution, repeating the process a fixed number of times.
+The grid search method is particularly useful for discrete hyperparameters, less refined searches or for a small number of combinations, while the second method can be used to explore the hyperparameter space on a larger scale~\cite{Bergstra:2012:RandomSearchHyperparameter}.
+Genetic algorithms are based on improving the choice of hyperparameters over \emph{generations} that successively select only the most promising values: in general they require a lot of tuning and are sensitive to the fact that the replication process can also lead, purely at random, to worse results~\cite{Rudolph:1994:ConvergenceAnalysisCanonical}.
+They are however effective when dealing with very deep or complex neural networks.
+Bayes optimisation~\cite{Snoek:2012:PracticalBayesianOptimization, Shahriari:2015:TakingHumanOut} is a well established mathematical procedure to find the extrema of a function without knowing its analytical form~\cite{Mockus:1975:BayesianMethodsSeeking}.
+It relies on assigning a \emph{prior} probability to a given parameter and then multiplying it by the probability distribution (or \emph{likelihood}) of the scoring function to compute the probability of finding better results given a set of hyperparameters.
+This has proven to be very effective in our case and we adopted this solution as it does not require fine tuning and leads to better results for models which are not deep neural networks.
+We choose to use \texttt{scikit-optimize}~\cite{Head:2020:ScikitoptimizeScikitoptimize}, whose class \texttt{BayesSearchCV} provides a well implemented Python interface compatible with \texttt{scikit-learn}.
+We will in general perform $50$ iterations of the Bayes search algorithm, unless otherwise specified.
+
+
+\subsection{Linear Models}
+
+Linear models attempt to describe the labels as linear combinations of the input features while keeping the coefficients at $\order{1}$ (see \Cref{sec:app:linreg}).
+However non-linearity can still be introduced by engineering features which are non-linear in terms of the original data.\footnotemark{}
+\footnotetext{%
+    In general \emph{linear model} is used to indicate that the coefficients $\beta$ of the features appear linearly in the expression of the prediction of the $i$-th label:
+    \begin{equation*}
+        y^{\text{pred}\, (i)}
+        =
+        \beta^{(i)}_0 + \beta^{(i)}_1 x^{(i)}_1 + \dots + \beta^{(i)}_F x^{(i)}_F
+        =
+        \finitesum{j}{0}{F} \beta^{(i)}_j x^{(i)}_j,
+    \end{equation*}
+    where $F$ is the number of independent variables and $x^{(i)}_0 = 1$ (i.e.\ $\beta^{(i)}_0$ is the intercept of the model and represents the value of the label without the contribution of any of the features).
+    In other words, each coefficient $\beta^{(i)}_j$ appears linearly (with unit exponent) and only once in the model.
+}
+
+From the results of \Cref{sec:data:eda}, we made a hypothesis on the linear dependence of \hodge{1}{1} on the number of projective spaces $m$.
+As a first approach, we can try to fit a linear model to the data as a baseline computation and to test whether there is actual linear correlation between the two quantities.
+We will consider different linear models including their regularised versions.
+
+
+\paragraph{Parameters}
+
+The linear regression is performed with the class \texttt{linear\_model.ElasticNet} in \texttt{scikit-learn}; a minimal sketch of the corresponding hyperparameter search is shown below.
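+The following is only an indicative example of how the Bayes search described above could be set up for this model; the actual search spaces and settings vary from model to model, and the names \texttt{X} and \texttt{y} (the selected features and one Hodge number) are assumptions defined elsewhere:
+\begin{verbatim}
+import numpy as np
+from sklearn.linear_model import ElasticNet
+from sklearn.metrics import make_scorer
+from skopt import BayesSearchCV
+from skopt.space import Categorical, Real
+
+def rounded_accuracy(y_true, y_pred):
+    # accuracy computed after rounding the real-valued predictions
+    return np.mean(np.rint(y_pred).astype(int) == np.asarray(y_true))
+
+search = BayesSearchCV(
+    ElasticNet(max_iter=10000),
+    {
+        "alpha": Real(1e-6, 1.0, prior="log-uniform"),
+        "l1_ratio": Real(0.0, 1.0),
+        "fit_intercept": Categorical([True, False]),
+    },
+    n_iter=50,                       # iterations of the Bayes search
+    cv=9,                            # folds of 10% of the samples each
+    scoring=make_scorer(rounded_accuracy),
+    random_state=0,
+)
+search.fit(X, y)                     # X, y: features and one Hodge number
+print(search.best_params_, search.best_score_)
+\end{verbatim}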
+The hyperparameters involved in this case are: the amount of regularisation $\alpha$, the relative ratio (\texttt{l1\_ratio}) between the $\ell_1$ and $\ell_2$ regularization losses, and the fit of the intercept. +By performing the hyperparameter optimisation we found that $\ell_2$ regularization has a minor impact and can be removed, which corresponds to setting the relative ratio to $1$ (this is equivalent to using \texttt{linear\_model.Lasso}). +In \Cref{tab:hyp:lin} we show the choices of the hyperparameters for the different models we built using the $\ell_1$ regularised linear regression. + +For the original dataset, we floored the predictions to the integers below, while in the favourable we rounded to the next integer. +This choice for the original dataset makes sense: the majority of the samples lie on the line $\hodge{1}{1} = m$, but there are still many samples with $\hodge{1}{1} > m$ (see \Cref{fig:eda:distr}). +As a consequence the \ml prediction pulls the line up which can only damage the accuracy. +Choosing the floor function is a way to counteract this effect. +Note that accuracy for \hodge{2}{1} is only slightly affected by the choice of rounding, so we just choose the same one as \hodge{1}{1} for simplification. + + +\begin{table}[tbp] +\centering +\resizebox{\linewidth}{!}{% +\begin{tabular}{@{}lccccccccc@{}} +\toprule + & & \multicolumn{2}{c}{\textbf{matrix}} & \multicolumn{2}{c}{\textbf{num\_cp}} & \multicolumn{2}{c}{\textbf{eng. feat.}} & \multicolumn{2}{c}{\textbf{PCA}} \\ \midrule + & & \textit{old} & \textit{fav.} & \textit{old} & \textit{fav.} & \textit{old} & \textit{fav.} & \textit{old} & \textit{fav.} \\ \midrule +\multirow{2}{*}{$\alpha$} & \hodge{1}{1} & $2.0 \times 10^{-6}$ & $3.0 \times 10^{-5}$ & 0.10 & $2.0 \times 10^{-6}$ & 0.05 & 0.05 & 0.07 & 0.08 \\ + & \hodge{2}{1} & $1.0 \times 10^{-6}$ & $1.0 \times 10^{-5}$ & 0.1 & $1.0 \times 10^{-6}$ & $3.0 \times 10^{-4}$ & $1.2 \times 10^{-3}$ & $2.0 \times 10^{-6}$ & $1.2 \times 10^{-3}$ \\ \midrule +\multirow{2}{*}{\texttt{fit\_intercept}} & \hodge{1}{1} & False & False & True & False & True & True & False & True \\ + & \hodge{2}{1} & True & True & True & True & True & False & True & False \\ \midrule +\multirow{2}{*}{\texttt{normalise}} & \hodge{1}{1} & --- & --- & False & --- & False & False & --- & False \\ + & \hodge{2}{1} & False & True & False & False & False & --- & True & --- \\ \bottomrule +\end{tabular}% +} +\caption{% + Hyperparameter choices of the $\ell_1$ regression model used. + In addition to the known hyperparameters $\alpha$ and \texttt{fit\_intercept}, we also include the \texttt{normalise} parameter which indicates whether the samples have been centered and scaled by their $\ell_2$ norm before the fit: it is ignored when the intercept is ignored. +} +\label{tab:hyp:lin} +\end{table} + + +\paragraph{Results} + +In~\Cref{tab:res:lin} we show the accuracy for the best hyperparameters. +For \hodge{1}{1}, the most precise predictions are given by the number of projective spaces which actually confirms the hypothesis of a strong linear dependence of \hodge{1}{1} on the number of projective spaces. +In fact this gives close to \SI{100}{\percent} accuracy for the favourable dataset which shows that there is no need for more advanced \ml algorithms. +Moreover adding more engineered features \emph{decreases} the accuracy in most cases where regularization is not appropriate. +The accuracy for \hodge{2}{1} remains low but including engineered features definitely improves it. 
+
+In~\Cref{fig:res:lin} we show the plots of the residual errors of the model on the original dataset.
+For the $\ell_1$ regularised linear model, the univariate plots show that the errors seem to follow normal distributions peaked at $0$, as they generally should: in the case of \hodge{1}{1}, the width is also quite contained.
+The scatter plots instead show that in general there is no correlation between a particular sector of the predictions and the error made by the model; the variance of the residuals is thus randomly distributed over the predictions.
+Only the fit using the number of projective spaces seems to show a slight correlation for \hodge{2}{1}, signalling that the model using only one feature might actually be incomplete: it is indeed better to also include the other engineered features.
+
+The learning curves in~\Cref{fig:lc:lin} show that the model underfits.
+We also notice that the models are only marginally affected by the number of samples used for training.
+In particular this provides a very strong baseline for \hodge{1}{1}.
+For comparison, we also give the learning curve for the favourable dataset in~\Cref{fig:lc:lin-fav}: this shows that a linear regression is completely sufficient to determine \hodge{1}{1} in that case.
+
+\begin{table}[tbp]
+    \centering
+    \begin{tabular}{@{}cccccc@{}}
+        \toprule
+         & & \textbf{matrix} & \textbf{num\_cp} & \textbf{eng. feat.} & \textbf{PCA} \\ \midrule
+        \multirow{2}{*}
+        {\emph{original}} & \hodge{1}{1} & \SI{51}{\percent} & \SI{63}{\percent} & \SI{63}{\percent} & \SI{64}{\percent} \\
+         & \hodge{2}{1} & \SI{11}{\percent} & \SI{8}{\percent} & \SI{21}{\percent} & \SI{21}{\percent} \\ \midrule
+        \multirow{2}{*}
+        {\emph{favourable}} & \hodge{1}{1} & \SI{95}{\percent} & \SI{100}{\percent} & \SI{100}{\percent} & \SI{100}{\percent} \\
+         & \hodge{2}{1} & \SI{14}{\percent} & \SI{15}{\percent} & \SI{19}{\percent} & \SI{19}{\percent} \\ \bottomrule
+    \end{tabular}
+    \caption{%
+        Best accuracy of the linear model using $\ell_1$ regularisation on the test split.
+    }
+    \label{tab:res:lin}
+\end{table}
+
+\begin{figure}[tbp]
+    \centering
+    \includegraphics[width=\linewidth]{img/lss_reg_orig}
+    \caption{%
+        Plots of the residual errors for the $\ell_1$ regularised linear model: rows show the different scenarios (fit with only the matrix, with only the number of projective spaces, with the engineered features, with the engineered features and the \pca).
+        Plots refer to the test split of the original dataset.
+    }
+    \label{fig:res:lin}
+\end{figure}
+
+\begin{figure}[tbp]
+    \centering
+    \begin{subfigure}[c]{0.45\linewidth}
+        \centering
+        \includegraphics[width=\linewidth]{img/linreg_learning_curve_matrix_outliers}
+        \caption{input: \texttt{matrix}, $\alpha = \num{2e-4}$}
+    \end{subfigure}
+    \hfill
+    \begin{subfigure}[c]{0.45\linewidth}
+        \centering
+        \includegraphics[width=\linewidth]{img/linreg_learning_curve_num_cp_outliers}
+        \caption{input: \texttt{num\_cp}, $\alpha = 1$}
+    \end{subfigure}
+    \caption{%
+        Learning curves for the linear regression (original dataset), including outliers and using a single model for both Hodge numbers.
+    }
+    \label{fig:lc:lin}
+\end{figure}
+
+\begin{figure}[tbp]
+    \centering
+    \includegraphics[width=0.45\linewidth]{img/linreg_learning_curve_num_cp_fav}
+    \caption{%
+        Learning curves for the linear regression (favourable dataset), including outliers and using a single model for both Hodge numbers.
+        Input: \texttt{num\_cp}, $\alpha = 1$.
+    }
+    \label{fig:lc:lin-fav}
+\end{figure}
+
+
+\subsection{Support Vector Machines}
+\label{sec:res:svr}
+
+\svm are a family of algorithms which use a \emph{kernel trick} to map the space of input data vectors into a higher dimensional space where samples can be accurately separated and fitted to an appropriate curve (see~\Cref{sec:app:svr}).
+In this analysis we consider two such kernels, namely a linear kernel (also known as \emph{no kernel}, since no transformation is involved) and a Gaussian kernel (known as \texttt{rbf} in the \ml literature, short for \emph{radial basis function}).
+
+
+\subsubsection{Linear Kernel}
+
+For this model we use the class \texttt{svm.LinearSVR} in \texttt{scikit-learn}.
+
+
+\paragraph{Parameters}
+
+In~\Cref{tab:hyp:linsvr} we show the choices of the hyperparameters used for the model.
+As we show in~\Cref{sec:app:svr}, the parameters $C$ and $\epsilon$ are related to the penalty assigned to the samples lying outside the no-penalty boundary (the loss in this case is computed according to the $\ell_1$ or $\ell_2$ norm of the distance from the boundary, as specified by the \texttt{loss} hyperparameter).
+The other parameters are related to the use of the intercept to improve the prediction.
+We floored the predictions for the original dataset and rounded them to the next integer for the favourable dataset.
+
+\begin{table}[tbp]
+\centering
+\resizebox{\linewidth}{!}{%
+\begin{tabular}{@{}lccccccccc@{}}
+\toprule
+ & & \multicolumn{2}{c}{\textbf{matrix}} & \multicolumn{2}{c}{\textbf{num\_cp}} & \multicolumn{2}{c}{\textbf{eng. feat.}} & \multicolumn{2}{c}{\textbf{PCA}} \\ \midrule
+ & & \textit{old} & \textit{fav.} & \textit{old} & \textit{fav.} & \textit{old} & \textit{fav.} & \textit{old} & \textit{fav.} \\ \midrule
+\multirow{2}{*}{\texttt{C}} & \hodge{1}{1} & 0.13 & 24 & 0.001 & 0.0010 & 0.13 & 0.001 & 0.007 & 0.4 \\
+ & \hodge{2}{1} & 0.30 & 100 & 0.05 & 0.0016 & 0.5 & 0.4 & 1.5 & 0.4 \\ \midrule
+\multirow{2}{*}{$\epsilon$} & \hodge{1}{1} & 0.7 & 0.3 & 0.4 & 0.00 & 0.9 & 0.0 & 0.5 & 0.0 \\
+ & \hodge{2}{1} & 0.0 & 0.0 & 10 & 0.03 & 0.0 & 0.0 & 0.0 & 0.6 \\ \midrule
+\multirow{2}{*}{\texttt{fit\_intercept}} & \hodge{1}{1} & True & False & True & False & True & False & False & False \\
+ & \hodge{2}{1} & True & False & True & True & True & True & True & False \\ \midrule
+\multirow{2}{*}{\texttt{intercept\_scaling}} & \hodge{1}{1} & 0.13 & --- & 100 & --- & 0.01 & --- & --- & --- \\
+ & \hodge{2}{1} & 100 & --- & 13 & 92 & 100 & 0.01 & 100 & --- \\ \midrule
+\multirow{2}{*}{\texttt{loss}} & \hodge{1}{1} & $\abs{\epsilon}$ & $\abs{\epsilon}$ & $\abs{\epsilon}$ & $\norm{\epsilon}^2$ & $\abs{\epsilon}$ & $\abs{\epsilon}$ & $\abs{\epsilon}$ & $\abs{\epsilon}$ \\
+ & \hodge{2}{1} & $\abs{\epsilon}$ & $\abs{\epsilon}$ & $\norm{\epsilon}^2$ & $\abs{\epsilon}$ & $\abs{\epsilon}$ & $\abs{\epsilon}$ & $\abs{\epsilon}$ & $\abs{\epsilon}$ \\ \bottomrule
+\end{tabular}%
+}
+\caption{%
+    Hyperparameter choices of the linear \svm regression.
+    The parameter \texttt{intercept\_scaling} is only relevant when the intercept is used.
+    The different losses simply distinguish between the $\ell_1$ norm of the $\epsilon$-dependent boundary where no penalty is assigned and its $\ell_2$ norm.
+}
+\label{tab:hyp:linsvr}
+\end{table}
+
+
+\paragraph{Results}
+
+In~\Cref{tab:res:linsvr}, we show the accuracy on the test set for the linear kernel.
+As we can see, the performance of the algorithm strongly resembles that of the linear models in terms of the accuracy reached.
+It is interesting to notice that including the \pca components does not improve the predictions with respect to using just the engineered features: it seems that the latter work better than the configuration matrix or its principal components.
+
+The residual plots in~\Cref{fig:res:linsvr} confirm what we already said about the linear models with regularisation: the model with only the number of projective spaces shows a tendency to heteroscedasticity, which can be balanced by adding more engineered features; this also helps producing more precise predictions (reflected in more peaked univariate distributions).\footnotemark{}
+\footnotetext{%
+    Heteroscedasticity refers to the tendency to have a correlation between the predictions and the residuals: theoretically speaking, there should not be any, since we suppose the residuals to be independent of the model and normally distributed.
+}
+In all cases, we notice that the model slightly overestimates the real values (residuals are computed as the difference between the prediction and the real value), as the second, small peaks in the histograms for \hodge{1}{1} suggest: this may also explain why flooring the predictions produces the highest accuracy.
+As is generally the case for linear models, the influence of the number of samples used for training is marginal here as well: we only noticed a decrease in accuracy when also including the \pca components or the matrix itself.
+
+\begin{table}[tbp]
+\centering
+\begin{tabular}{@{}cccccc@{}}
+    \toprule
+     & & \textbf{matrix} & \textbf{num\_cp} & \textbf{eng. feat.} & \textbf{PCA} \\ \midrule
+    \multirow{2}{*}
+    {\emph{original}} & \hodge{1}{1} & \SI{61}{\percent} & \SI{63}{\percent} & \SI{65}{\percent} & \SI{62}{\percent} \\
+     & \hodge{2}{1} & \SI{11}{\percent} & \SI{9}{\percent} & \SI{21}{\percent} & \SI{20}{\percent} \\ \midrule
+    \multirow{2}{*}
+    {\emph{favourable}} & \hodge{1}{1} & \SI{96}{\percent} & \SI{100}{\percent} & \SI{100}{\percent} & \SI{100}{\percent} \\
+     & \hodge{2}{1} & \SI{14}{\percent} & \SI{14}{\percent} & \SI{19}{\percent} & \SI{20}{\percent} \\ \bottomrule
+    \end{tabular}
+    \caption{Accuracy of the linear \svm on the test split.}
+    \label{tab:res:linsvr}
+\end{table}
+
+
+\begin{figure}[tbp]
+    \centering
+    \includegraphics[width=0.9\linewidth]{img/lin_svr_orig}
+    \caption{Plots of the residual errors for the \svm with linear kernel.}
+    \label{fig:res:linsvr}
+\end{figure}
+
+
+\subsubsection{Gaussian Kernel}
+
+We then consider \svm using a Gaussian function as kernel.
+The choice of the kernel function can heavily influence the outcome of the predictions, since it maps the samples into a much higher dimensional space and creates highly non-linear combinations of the features before the fit.
+In general this can help in the presence of ``obscure'' features which correlate badly with one another.
+In our case we hope to leverage the already good correlations we found in the \eda with the kernel trick.
+The implementation is done with the class \texttt{svm.SVR} from \texttt{scikit-learn}.
+
+
+\paragraph{Parameters}
+
+As we show in~\Cref{sec:app:svr}, this particular choice of kernel leads to profoundly different behaviour with respect to linear models: we will round the predictions to the next integer in both datasets, since the loss function strongly penalises unaligned samples.
+In~\Cref{tab:hyp:svrrbf} we show the choices of the hyperparameters for the models using the Gaussian kernel; a minimal usage sketch is given below.
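+Schematically, each model amounts to fitting one \texttt{svm.SVR} estimator per Hodge number; the values used here are only an example taken from the table (\hodge{1}{1}, configuration matrix, original dataset) and not a general prescription:
+\begin{lstlisting}[language=Python]
+import numpy as np
+from sklearn.svm import SVR
+
+def fit_gaussian_svr(X_train, y_train, X_test, C=14.0, epsilon=0.01, gamma=0.03):
+    """Gaussian-kernel SVR for a single Hodge number."""
+    svr = SVR(kernel="rbf", C=C, epsilon=epsilon, gamma=gamma)
+    svr.fit(X_train, y_train)
+    # Round to the next integer, as discussed above for this kernel.
+    return np.rint(svr.predict(X_test)).astype(int)
+\end{lstlisting}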
+As usual, the hyperparameter \texttt{C} is connected to the penalty assigned to the samples outside the soft margin boundary (see~\Cref{sec:app:svr}) delimited by $\epsilon$.
+Given the presence of a non-linear kernel, we have to introduce an additional hyperparameter $\gamma$, which controls the width of the Gaussian function used for the support vectors.
+
+\begin{table}[tbp]
+\centering
+\begin{tabular}{@{}lccccccccc@{}}
+\toprule
+ & & \multicolumn{2}{c}{\textbf{matrix}} & \multicolumn{2}{c}{\textbf{num\_cp}} & \multicolumn{2}{c}{\textbf{eng. feat.}} & \multicolumn{2}{c}{\textbf{PCA}} \\ \midrule
+ & & \textit{old} & \textit{fav.} & \textit{old} & \textit{fav.} & \textit{old} & \textit{fav.} & \textit{old} & \textit{fav.} \\ \midrule
+\multirow{2}{*}{\texttt{C}} & \hodge{1}{1} & 14 & 1000 & 170 & 36 & 3 & 40 & 1.0 & 1000 \\
+ & \hodge{2}{1} & 40 & 1000 & 1.0 & 1.0 & 84 & 62 & 45 & 40 \\ \midrule
+\multirow{2}{*}{$\epsilon$} & \hodge{1}{1} & 0.01 & 0.01 & 0.45 & 0.03 & 0.05 & 0.3 & 0.02 & 0.01 \\
+ & \hodge{2}{1} & 0.01 & 0.01 & 0.01 & 0.09 & 0.29 & 0.10 & 0.20 & 0.09 \\ \midrule
+\multirow{2}{*}{$\gamma$} & \hodge{1}{1} & 0.03 & 0.002 & 0.110 & 0.009 & 0.07 & 0.003 & 0.02 & 0.001 \\
+ & \hodge{2}{1} & 0.06 & 0.100 & 0.013 & 1000 & 0.016 & 0.005 & 0.013 & 0.006 \\ \bottomrule
+\end{tabular}%
+\caption{Hyperparameter choices of the \svm regression with Gaussian kernel.}
+\label{tab:hyp:svrrbf}
+\end{table}
+
+
+\paragraph{Results}
+
+In~\Cref{tab:res:svrrbf} we show the accuracy of the predictions on the test sets.
+In the favourable dataset we can immediately appreciate the strong linear dependence of \hodge{1}{1} on the number of projective spaces: even though there are a few non-favourable embeddings in the dataset, the kernel trick is able to map them into a better representation and improve the accuracy.
+The predictions for the original dataset have also improved: they are the best results we found using shallow learning.
+The predictions using only the configuration matrix match~\cite{Bull:2018:MachineLearningCICY}, but we can slightly improve the accuracy by using a combination of engineered features and \pca.
+In~\Cref{fig:res:svrrbf} we show the residual plots and their histograms for the original dataset: the residuals follow peaked distributions which, in this case, do not present a second smaller peak (hence the rounding to the next integer), and their variance is well distributed over the predictions.
+
+The Gaussian kernel is also more influenced by the size of the training set.
+Using \SI{50}{\percent} of the samples as training set, we witnessed a drop in accuracy of \SI{3}{\percent} when using the engineered features and the \pca, and of around \SI{1}{\percent} to \SI{2}{\percent} in all other cases.
+The learning curves (presented in~\Cref{fig:lc:svrrbf}) show that the accuracy improves when using more data.
+Interestingly, they also show that using all engineered features leads to an overfit on the training data, where the accuracy reaches almost \SI{100}{\percent} for both Hodge numbers, while this is not the case for \hodge{2}{1} on the validation set.
+For comparison, we also display in \Cref{fig:lc:svrrbf-fav} the learning curve for the favourable dataset: this shows that predicting \hodge{1}{1} accurately works out-of-the-box.
+
+\begin{table}[tbp]
+\centering
+\begin{tabular}{@{}cccccc@{}}
+    \toprule
+     & & \textbf{matrix} & \textbf{num\_cp} & \textbf{eng. feat.} & \textbf{PCA} \\ \midrule
+    \multirow{2}{*}
+    {\emph{original}} & \hodge{1}{1} & \SI{70}{\percent} & \SI{63}{\percent} & \SI{66}{\percent} & \SI{72}{\percent} \\
+     & \hodge{2}{1} & \SI{22}{\percent} & \SI{10}{\percent} & \SI{36}{\percent} & \SI{34}{\percent} \\ \midrule
+    \multirow{2}{*}
+    {\emph{favourable}} & \hodge{1}{1} & \SI{99}{\percent} & \SI{100}{\percent} & \SI{100}{\percent} & \SI{100}{\percent} \\
+     & \hodge{2}{1} & \SI{22}{\percent} & \SI{17}{\percent} & \SI{32}{\percent} & \SI{33}{\percent} \\ \bottomrule
+\end{tabular}
+\caption{Accuracy of the Gaussian \svm on the test split.}
+\label{tab:res:svrrbf}
+\end{table}
+
+\begin{figure}[tbp]
+    \centering
+    \includegraphics[width=\linewidth]{img/svr_rbf_orig}
+    \caption{Plots of the residual errors for the \svm with Gaussian kernel.}
+    \label{fig:res:svrrbf}
+\end{figure}
+
+
+\begin{figure}[htp]
+    \centering
+    \begin{subfigure}[c]{0.45\linewidth}
+        \centering
+        \includegraphics[width=\linewidth]{img/svm_learning_curve_matrix_outliers}
+        \caption{input: \texttt{matrix}, $C = 15, \gamma = 0.03, \epsilon = 0.1$}
+    \end{subfigure}
+    \hfill
+    \begin{subfigure}[c]{0.45\linewidth}
+        \centering
+        \includegraphics[width=\linewidth]{img/svm_learning_curve_all_outliers}
+        \caption{input: all, $C = 10, \gamma = 0.03, \epsilon = 0.1$}
+    \end{subfigure}
+    \caption{%
+        Learning curves for the \svm with Gaussian kernel (original dataset), using a single model for both Hodge numbers.
+    }
+    \label{fig:lc:svrrbf}
+\end{figure}
+
+\begin{figure}[tbp]
+    \centering
+    \begin{subfigure}[c]{0.45\linewidth}
+        \centering
+        \includegraphics[width=\linewidth]{img/svm_learning_curve_matrix_fav}
+        \caption{input: \texttt{matrix}, $C = 20, \gamma = \mathtt{scale}, \epsilon = 0.1$}
+    \end{subfigure}
+    \hfill
+    \begin{subfigure}[c]{0.45\linewidth}
+        \centering
+        \includegraphics[width=\linewidth]{img/svm_learning_curve_num_cp_fav}
+        \caption{input: all, $C = 20, \gamma = \mathtt{scale}, \epsilon = 0.1$}
+    \end{subfigure}
+    \caption{%
+        Learning curves for the \svm with Gaussian kernel (favourable dataset), using a single model for both Hodge numbers.
+    }
+    \label{fig:lc:svrrbf-fav}
+\end{figure}
+
+
+\subsection{Decision Trees}
+\label{sec:ml:trees}
+
+We now consider two algorithms based on decision trees: random forests and gradient boosted trees.
+Decision trees are powerful algorithms which implement a simple decision rule (in the style of an \emph{if\dots then\dots else\dots} statement) to classify or assign a value to the predictions.
+However, they have a tendency to adapt too well to the training set and not to be robust enough against small changes in the training data.
+We consider a generalisation of this algorithm used for \emph{ensemble learning}: this is a technique in \ml which uses multiple estimators (either identical or different) to improve the performance.
+We will present the results of \emph{random forests}, which increase the bias compared to a single decision tree, and of \emph{gradient boosted} decision trees, which can use smaller trees to decrease the variance and learn better representations of the input data by iterating their decision functions and using information from the previous runs to improve (see~\Cref{sec:app:trees} for a more in-depth description).
+
+
+\subsubsection{Random Forests}
+
+The random forest algorithm is implemented with the class \texttt{ensemble.RandomForestRegressor} in \texttt{scikit-learn}.
+
+
+\paragraph{Parameters}
+
+Hyperparameter tuning for decision trees can in general be quite challenging.
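+To give an idea of the search, the tree hyperparameters were explored with \texttt{BayesSearchCV} over ranges of the following kind (the bounds are illustrative and the criterion names follow the \texttt{scikit-learn} version used in this work):
+\begin{lstlisting}[language=Python]
+from skopt import BayesSearchCV
+from skopt.space import Categorical, Integer, Real
+from sklearn.ensemble import RandomForestRegressor
+
+def tune_forest(X, y):
+    # Illustrative search space over the tree-shape hyperparameters discussed below.
+    search_space = {
+        "n_estimators": Integer(10, 300),
+        "max_depth": Integer(10, 100),
+        "max_leaf_nodes": Integer(10, 100),
+        "min_samples_split": Integer(2, 100),
+        "min_samples_leaf": Integer(1, 100),
+        "min_weight_fraction_leaf": Real(0.0, 0.5),
+        "criterion": Categorical(["mse", "mae"]),
+    }
+    opt = BayesSearchCV(RandomForestRegressor(), search_space,
+                        n_iter=25, cv=5, n_jobs=-1)  # 25 Bayes iterations
+    return opt.fit(X, y)
+\end{lstlisting}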
+From the general theory of random forests (\Cref{sec:app:trees}), we can look for particular shapes of the trees: this ensemble learning technique usually prefers a small number of fully grown trees.
+We performed only 25 iterations of the optimisation process due to the very long time taken to train all the decision trees.
+
+In \Cref{tab:hyp:rndfor}, we show the hyperparameters used for the predictions.
+As we can see from \texttt{n\_estimators}, random forests are usually built with a small number of fully grown trees (whose size is controlled by \texttt{max\_depth} and \texttt{max\_leaf\_nodes}), though this is not always the case.
+In order to avoid overfitting, we also tried to increase the number of samples necessary to split a branch or create a leaf node using \texttt{min\_samples\_leaf} and \texttt{min\_samples\_split} (also introducing a weight on the samples in the leaf nodes, specified by \texttt{min\_weight\_fraction\_leaf}, to balance the tree).
+Finally, the \texttt{criterion} chosen by the optimisation reflects the measure of impurity used by the trees, namely either the mean squared error (\texttt{mse}) or the mean absolute error (\texttt{mae}) of the predictions (see \Cref{sec:app:trees}).
+
+
+\begin{table}[htp]
+\centering
+\resizebox{\linewidth}{!}{%
+\begin{tabular}{@{}lccccccccc@{}}
+\toprule
+ & & \multicolumn{2}{c}{\textbf{matrix}} & \multicolumn{2}{c}{\textbf{num\_cp}} & \multicolumn{2}{c}{\textbf{eng. feat.}} & \multicolumn{2}{c}{\textbf{PCA}} \\ \midrule
+ & & \textit{old} & \textit{fav.} & \textit{old} & \textit{fav.} & \textit{old} & \textit{fav.} & \textit{old} & \textit{fav.} \\ \midrule
+\multirow{2}{*}{\texttt{criterion}} & \hodge{1}{1} & \texttt{mse} & \texttt{mse} & \texttt{mae} & \texttt{mae} & \texttt{mae} & \texttt{mse} & \texttt{mae} & \texttt{mae} \\
+ & \hodge{2}{1} & \texttt{mae} & \texttt{mae} & \texttt{mae} & \texttt{mae} & \texttt{mae} & \texttt{mae} & \texttt{mae} & \texttt{mae} \\ \midrule
+\multirow{2}{*}{\texttt{max\_depth}} & \hodge{1}{1} & 100 & 100 & 100 & 30 & 90 & 30 & 30 & 60 \\
+ & \hodge{2}{1} & 90 & 100 & 90 & 75 & 100 & 100 & 100 & 60 \\ \midrule
+\multirow{2}{*}{\texttt{max\_leaf\_nodes}} & \hodge{1}{1} & 100 & 80 & 90 & 20 & 20 & 35 & 90 & 90 \\
+ & \hodge{2}{1} & 90 & 100 & 100 & 75 & 100 & 60 & 100 & 100 \\ \midrule
+\multirow{2}{*}{\texttt{min\_samples\_leaf}} & \hodge{1}{1} & 1 & 1 & 1 & 15 & 1 & 15 & 1 & 1 \\
+ & \hodge{2}{1} & 3 & 1 & 4 & 70 & 1 & 70 & 30 & 1 \\ \midrule
+\multirow{2}{*}{\texttt{min\_samples\_split}} & \hodge{1}{1} & 2 & 30 & 20 & 35 & 10 & 10 & 100 & 100 \\
+ & \hodge{2}{1} & 30 & 2 & 50 & 45 & 2 & 100 & 2 & 100 \\ \midrule
+\multirow{2}{*}{\texttt{min\_weight\_fraction\_leaf}} & \hodge{1}{1} & 0.0 & 0.0 & 0.0 & $1.7 \times 10^{-3}$ & 0.0 & 0.009 & 0.0 & 0.0 \\
+ & \hodge{2}{1} & $3.0 \times 10^{-4}$ & 0.0 & $1.0 \times 10^{-4}$ & 0.13 & 0.0 & 0.0 & 0.0 & 0.0 \\ \midrule
+\multirow{2}{*}{\texttt{n\_estimators}} & \hodge{1}{1} & 10 & 100 & 45 & 120 & 155 & 300 & 10 & 300 \\
+ & \hodge{2}{1} & 190 & 10 & 160 & 300 & 10 & 10 & 10 & 300 \\ \bottomrule
+\end{tabular}%
+}
+\caption{Hyperparameter choices of the random forest regression.}
+\label{tab:hyp:rndfor}
+\end{table}
+
+
+\paragraph{Results}
+
+In \Cref{tab:res:rndfor}, we summarise the accuracy reached using random forests of decision trees as estimators.
+As we already expected, the contribution of the number of projective spaces helps the algorithm to generate better predictions.
+In general, it seems that the engineered features alone can already provide a good basis for predictions.
+In the case of \hodge{2}{1}, the introduction of the principal components of the configuration matrix also increases the prediction capabilities.
+As in most other cases, we used the floor function for the predictions on the original dataset and rounded to the next integer for the favourable one.
+
+As usual, in \Cref{fig:res:rndfor} we show the histograms of the distribution of the residual errors and the scatter plots of the residuals.
+While the distributions of the errors are slightly wider than for the \svm algorithms, the scatter plots of the residuals show a strong heteroscedasticity in the case of the fit using the number of projective spaces: though quite accurate, the model is strongly incomplete.
+The inclusion of the other engineered features definitely helps and also leads to better predictions.
+Learning curves are displayed in \Cref{fig:lc:rndfor}.
+
+
+\begin{table}[htp]
+\centering
+\begin{tabular}{@{}cccccc@{}}
+    \toprule
+     & & \textbf{matrix} & \textbf{num\_cp} & \textbf{eng. feat.} & \textbf{PCA} \\ \midrule
+    \multirow{2}{*}
+    {\emph{original}} & \hodge{1}{1} & \SI{55}{\percent} & \SI{63}{\percent} & \SI{66}{\percent} & \SI{64}{\percent} \\
+     & \hodge{2}{1} & \SI{12}{\percent} & \SI{9}{\percent} & \SI{17}{\percent} & \SI{18}{\percent} \\ \midrule
+    \multirow{2}{*}
+    {\emph{favourable}} & \hodge{1}{1} & \SI{89}{\percent} & \SI{99}{\percent} & \SI{98}{\percent} & \SI{98}{\percent} \\
+     & \hodge{2}{1} & \SI{14}{\percent} & \SI{17}{\percent} & \SI{22}{\percent} & \SI{27}{\percent} \\ \bottomrule
+\end{tabular}
+\caption{Accuracy of the random forests on the test split.}
+\label{tab:res:rndfor}
+\end{table}
+
+
+\begin{figure}[htp]
+    \centering
+    \includegraphics[width=0.9\linewidth]{img/rnd_for_orig}
+    \caption{Plots of the residual errors for the random forests.}
+    \label{fig:res:rndfor}
+\end{figure}
+
+
+\begin{figure}[htp]
+    \centering
+
+    \begin{subfigure}[c]{0.45\linewidth}
+        \centering
+        \includegraphics[width=\linewidth]{img/forest_learning_curve_matrix_outliers}
+        \caption{input: \lstinline!matrix!, default parameters}
+    \end{subfigure}
+    \qquad
+    \begin{subfigure}[c]{0.45\linewidth}
+        \centering
+        \includegraphics[width=\linewidth]{img/forest_learning_curve_all_outliers}
+        \caption{input: all, default parameters}
+    \end{subfigure}
+
+    \caption{Learning curves for the random forest (original dataset), including outliers and using a single model for both Hodge numbers.}
+    \label{fig:lc:rndfor}
+\end{figure}
+
+
+\subsubsection{Gradient Boosted Trees}
+
+
+We used the class \lstinline!ensemble.GradientBoostingRegressor! from \texttt{scikit-learn} in order to implement the gradient boosted trees.
+
+
+\paragraph{Parameters}
+
+Hyperparameter optimisation has been performed using 25 iterations of the Bayes search algorithm, since by comparison the gradient boosting algorithm took the longest training time.
+We show the chosen hyperparameters in \Cref{tab:hyp:grdbst}.
+
+With respect to the random forests, for the gradient boosting we also need to introduce the \texttt{learning\_rate} (or \emph{shrinkage parameter}), which controls the gradient descent of the optimisation; the latter is driven by the choice of the \texttt{loss} parameter (\texttt{ls} is the ordinary least squares loss, \texttt{lad} is the least absolute deviation and \texttt{huber} is a combination of the previous two losses weighted by the hyperparameter $\alpha$).
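+For orientation, these hyperparameters (together with the \texttt{subsample} fraction discussed next) enter the estimator roughly as in the following sketch; the values are purely illustrative and the loss names follow the \texttt{scikit-learn} version used in this work:
+\begin{lstlisting}[language=Python]
+from sklearn.ensemble import GradientBoostingRegressor
+
+# Illustrative values only; the per-case choices are listed in the table below.
+gbr = GradientBoostingRegressor(
+    loss="huber",        # "ls", "lad" or "huber" (the latter weighted by alpha)
+    alpha=0.9,
+    learning_rate=0.1,   # shrinkage parameter of the gradient descent
+    n_estimators=100,
+    max_depth=10,
+    subsample=0.8,       # fraction of samples used at each boosting iteration
+)
+\end{lstlisting}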
+We also introduce the \texttt{subsample} hyperparameter, which chooses a fraction of the samples to be fed into the algorithm at each iteration.
+This procedure both has a regularisation effect on the trees, which should not adapt too much to the training set, and speeds up the training (at least by a small amount).
+
+
+\begin{table}[htp]
+\centering
+\resizebox{\linewidth}{!}{%
+\begin{tabular}{@{}lccccccccc@{}}
+\toprule
+ & & \multicolumn{2}{c}{\textbf{matrix}} & \multicolumn{2}{c}{\textbf{num\_cp}} & \multicolumn{2}{c}{\textbf{eng. feat.}} & \multicolumn{2}{c}{\textbf{PCA}} \\ \midrule
+ & & \textit{old} & \textit{fav.} & \textit{old} & \textit{fav.} & \textit{old} & \textit{fav.} & \textit{old} & \textit{fav.} \\ \midrule
+\multirow{2}{*}{$\alpha$} & \hodge{1}{1} & 0.4 & --- & --- & --- & --- & --- & --- & --- \\
+ & \hodge{2}{1} & --- & 0.11 & --- & --- & 0.99 & --- & --- & --- \\ \midrule
+\multirow{2}{*}{\texttt{criterion}} & \hodge{1}{1} & \texttt{mae} & \texttt{mae} & \texttt{friedman\_mse} & \texttt{mae} & \texttt{friedman\_mse} & \texttt{friedman\_mse} & \texttt{mae} & \texttt{mae} \\
+ & \hodge{2}{1} & \texttt{mae} & \texttt{mae} & \texttt{friedman\_mse} & \texttt{mae} & \texttt{mae} & \texttt{mae} & \texttt{mae} & \texttt{mae} \\ \midrule
+\multirow{2}{*}{\texttt{learning\_rate}} & \hodge{1}{1} & 0.3 & 0.04 & 0.6 & 0.03 & 0.15 & 0.5 & 0.04 & 0.03 \\
+ & \hodge{2}{1} & 0.6 & 0.5 & 0.3 & 0.5 & 0.04 & 0.02 & 0.03 & 0.07 \\ \midrule
+\multirow{2}{*}{\texttt{loss}} & \hodge{1}{1} & huber & ls & lad & ls & ls & lad & ls & ls \\
+ & \hodge{2}{1} & ls & huber & ls & ls & huber & ls & ls & lad \\ \midrule
+\multirow{2}{*}{\texttt{max\_depth}} & \hodge{1}{1} & 100 & 100 & 15 & 60 & 2 & 100 & 55 & 2 \\
+ & \hodge{2}{1} & 85 & 100 & 100 & 30 & 35 & 60 & 15 & 2 \\ \midrule
+\multirow{2}{*}{\texttt{min\_samples\_split}} & \hodge{1}{1} & 2 & 30 & 20 & 35 & 10 & 10 & 100 & 100 \\
+ & \hodge{2}{1} & 30 & 2 & 50 & 45 & 2 & 100 & 2 & 100 \\ \midrule
+\multirow{2}{*}{\texttt{min\_weight\_fraction\_leaf}} & \hodge{1}{1} & 0.03 & 0.0 & 0.0 & 0.2 & 0.2 & 0.0 & 0.06 & 0.0 \\
+ & \hodge{2}{1} & 0.0 & 0.0 & 0.16 & 0.004 & 0.0 & 0.0 & 0.0 & 0.0 \\ \midrule
+\multirow{2}{*}{\texttt{n\_estimators}} & \hodge{1}{1} & 90 & 240 & 120 & 220 & 100 & 130 & 180 & 290 \\
+ & \hodge{2}{1} & 100 & 300 & 10 & 20 & 200 & 300 & 300 & 300 \\ \midrule
+\multirow{2}{*}{\texttt{subsample}} & \hodge{1}{1} & 0.8 & 0.8 & 0.9 & 0.6 & 0.1 & 0.1 & 1.0 & 0.9 \\
+ & \hodge{2}{1} & 0.7 & 1.0 & 0.1 & 0.9 & 0.1 & 0.9 & 0.1 & 0.2 \\ \bottomrule
+\end{tabular}%
+}
+\caption{Hyperparameter choices of the gradient boosted decision trees.}
+\label{tab:hyp:grdbst}
+\end{table}
+
+
+\paragraph{Results}
+
+We show the results of gradient boosting in \Cref{tab:res:grdbst}.
+As usual, the linear dependence of \hodge{1}{1} on the number of projective spaces is evident and in this case it also produces the best accuracy for \hodge{1}{1} (using the floor function for the original dataset and rounding to the next integer for the favourable one).
+\hodge{2}{1} is once again strongly helped by the presence of the redundant features.
+
+In \Cref{fig:res:grdbst}, we finally show the histograms and the scatter plots of the residual errors for the original dataset: they show that also in this case the choice of the floor function is justified and that the addition of the engineered features certainly improves the overall variance of the residuals.
+
+
+\begin{table}[htp]
+\centering
+\begin{tabular}{@{}cccccc@{}}
+    \toprule
+     & & \textbf{matrix} & \textbf{num\_cp} & \textbf{eng. feat.} & \textbf{PCA} \\ \midrule
+    \multirow{2}{*}
+    {\emph{original}} & \hodge{1}{1} & \SI{50}{\percent} & \SI{63}{\percent} & \SI{61}{\percent} & \SI{58}{\percent} \\
+     & \hodge{2}{1} & \SI{14}{\percent} & \SI{9}{\percent} & \SI{23}{\percent} & \SI{21}{\percent} \\ \midrule
+    \multirow{2}{*}
+    {\emph{favourable}} & \hodge{1}{1} & \SI{97}{\percent} & \SI{100}{\percent} & \SI{99}{\percent} & \SI{99}{\percent} \\
+     & \hodge{2}{1} & \SI{17}{\percent} & \SI{16}{\percent} & \SI{35}{\percent} & \SI{22}{\percent} \\ \bottomrule
+\end{tabular}
+\caption{Accuracy of the gradient boosting on the test split.}
+\label{tab:res:grdbst}
+\end{table}
+
+
+\begin{figure}[htp]
+    \centering
+    \includegraphics[width=0.9\linewidth]{img/grd_bst_orig}
+    \caption{Plots of the residual errors for the gradient boosted trees.}
+    \label{fig:res:grdbst}
+\end{figure}
+
+
+\subsection{Neural Networks}
+
+
+In this section we approach the problem of predicting the Hodge numbers using artificial neural networks (ANN), which we briefly review in \Cref{sec:app:nn}.
+We use Google's \emph{TensorFlow} framework and \emph{Keras}, its high-level API, to implement the architectures and train the networks~\cite{Abadi:2015:TensorFlowLargescaleMachine}.
+We explore different architectures and discuss the results.
+
+Differently from the previous algorithms, we do not perform a cross-validation scoring but simply retain \SI{10}{\percent} of the total set as a holdout validation set (also referred to as \emph{development} set), due to the limited computational power available.
+Thus, we use \SI{80}{\percent} of the samples for training, \SI{10}{\percent} for evaluation and \SI{10}{\percent} as a test set.
+For the same reason, the optimisation of the algorithm has been performed manually.
+
+We always use the Adam optimiser with default learning rate $\num{e-3}$ to perform the gradient descent and a fixed batch size of $32$.
+The network is trained for a large number of epochs to avoid missing possible local optima.
+In order to avoid overshooting the minimum, we dynamically reduce the learning rate both using the \emph{Adam} optimiser, which implements learning rate decay, and through the callback \texttt{callbacks.ReduceLROnPlateau} in Keras, which scales the learning rate by a given factor when the monitored quantity (e.g.\ the validation loss) does not decrease: we choose to scale it by a factor of $0.3$ when the validation loss does not improve for at least $75$ epochs.
+Moreover, we stop training when the validation loss does not improve for $200$ epochs.
+We then keep only the weights of the network which gave the best results.
+Batch normalisation layers are used with a momentum of $0.99$.
+
+Training and evaluation were performed on an \texttt{NVidia GeForce 940MX} laptop GPU with 2~GB of RAM.
+
+
+\subsubsection{Fully Connected Network}
+
+
+First, we reproduce the analysis from~\cite{Bull:2018:MachineLearningCICY} for the prediction of \hodge{1}{1}.
+
+
+\paragraph{Model}
+
+The neural network presented in~\cite{Bull:2018:MachineLearningCICY} for the regression task contains $5$ hidden layers with $876$, $461$, $437$, $929$ and $404$ units (\Cref{fig:nn:dense}).
+All layers (including the output layer) are followed by a ReLU activation and by a dropout layer with a rate of $\num{0.2072}$.
+This network contains roughly $\num{1.58e6}$ parameters.
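+A minimal Keras sketch of this architecture, together with callbacks implementing the learning rate schedule and early stopping described above, could look as follows (the batch normalisation layers correspond to the choices detailed below, while the mean squared error loss is our assumption since the reference does not specify it):
+\begin{lstlisting}[language=Python]
+from tensorflow import keras
+from tensorflow.keras import layers, callbacks
+
+def build_fc_network(input_dim):
+    """FC network: 5 hidden layers plus the output, each followed by
+    ReLU, batch normalisation and dropout with rate 0.2072."""
+    model = keras.Sequential()
+    model.add(keras.Input(shape=(input_dim,)))
+    for units in (876, 461, 437, 929, 404, 1):   # the last layer is the output
+        model.add(layers.Dense(units))
+        model.add(layers.ReLU())
+        model.add(layers.BatchNormalization(momentum=0.99))
+        model.add(layers.Dropout(0.2072))
+    model.compile(optimizer=keras.optimizers.Adam(1e-3), loss="mse")  # loss assumed
+    return model
+
+# Callbacks reproducing the training schedule described in the text.
+fit_callbacks = [
+    callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.3, patience=75),
+    callbacks.EarlyStopping(monitor="val_loss", patience=200,
+                            restore_best_weights=True),
+]
+\end{lstlisting}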
+
+The other hyperparameters (like the optimiser, batch size, number of epochs, regularisation, etc.) are not mentioned.
+In order to reproduce the results, we have filled the gap as follows:
+\begin{itemize}
+    \item Adam optimiser with batch size of $32$;
+
+    \item a maximal number of epochs of $2000$ without early stopping;\footnote{It took around 20 minutes to train the model.}
+
+    \item we implement learning rate reduction by a factor of $0.3$ after $75$ epochs without improvement of the validation loss;
+
+    \item no $\ell_1$ or $\ell_2$ regularisation;
+
+    \item a batch normalisation layer~\cite{Ioffe:2015:BatchNormalizationAccelerating} after each fully connected layer.
+\end{itemize}
+
+
+
+\paragraph{Results}
+
+We have first reproduced the results from~\cite{Bull:2018:MachineLearningCICY}, which are summarised in \Cref{tab:res:neuralnet-bull}.
+The training process was very quick and the loss function is reported in \Cref{fig:nn:bull_et_al_loss}.
+We obtain an accuracy of \SI{77}{\percent} on both the development and the test set of the original dataset with \SI{80}{\percent} of training data (see \Cref{tab:res:ann}).
+Using the same network, we also achieved \SI{97}{\percent} accuracy on the favourable dataset.
+
+
+\begin{figure}[htp]
+    \centering
+    \begin{minipage}[t]{0.475\linewidth}
+        \centering
+        \includegraphics[width=\linewidth]{img/fc}
+        \caption{Architecture of the fully connected network to predict \hodge{1}{1}.
+        For simplicity we do not draw the dropout and batch normalisation layers present after every FC layer.}
+        \label{fig:nn:dense}
+    \end{minipage}
+    \hfill
+    \begin{minipage}[t]{0.475\linewidth}
+        \centering
+        \includegraphics[width=\linewidth, trim={0 0 6in 0}, clip]{img/loss-lr_fc_orig}
+        \caption{Loss function of the FC network on the original dataset.}
+        \label{fig:nn:bull_et_al_loss}
+    \end{minipage}
+\end{figure}
+
+
+\begin{table}[htb]
+    \centering
+    \begin{tabular}{@{}cccccc@{}}
+        \toprule
+         &
+        \multicolumn{5}{c}{\textbf{training data}}
+        \\
+         &
+        \SI{10}{\percent} &
+        \SI{30}{\percent} &
+        \SI{50}{\percent} &
+        \SI{70}{\percent} &
+        \SI{90}{\percent}
+        \\
+        \midrule
+        regression &
+        \SI{58}{\percent} &
+        \SI{68}{\percent} &
+        \SI{72}{\percent} &
+        \SI{75}{\percent} &
+        \SI{75}{\percent}
+        \\
+        classification &
+        \SI{68}{\percent} &
+        \SI{78}{\percent} &
+        \SI{82}{\percent} &
+        \SI{85}{\percent} &
+        \SI{88}{\percent}
+        \\
+        \bottomrule
+    \end{tabular}
+    \caption{Accuracy (approximate) for \hodge{1}{1} obtained in \cite[Figure~1]{Bull:2018:MachineLearningCICY}.}
+    \label{tab:res:neuralnet-bull}
+\end{table}
+
+
+\subsubsection{Convolutional Network}
+
+
+We then present a new purely convolutional network to predict \hodge{1}{1} and \hodge{2}{1}, separately or together.
+The advantage of such networks is that they require a smaller number of parameters and are insensitive to the size of the inputs.
+The latter point can be helpful to work without padding the matrices (of the same or different representations), but the use of a flatten layer removes this benefit.
+
+
+\paragraph{Model}
+
+The neural network has $4$ convolutional layers.
+They are connected to the output layer with an intermediate flatten layer.
+After each convolutional layer, we use the ReLU activation function and a batch normalisation layer (with momentum 0.99).
+Convolutional layers use the padding option \lstinline!same!
and a kernel of size $(5, 5)$ to be able to extract more meaningful representations of the input, treating the configuration matrix somewhat similarly to an object segmentation task~\cite{Peng:2017:LargeKernelMattersa}.
+The output layer is also followed by a ReLU activation in order to force the prediction to be a positive number.
+We use a dropout layer only after the convolutional network (before the flatten layer), but we introduced a combination of $\ell_2$ and $\ell_1$ regularisation to reduce the variance.
+The dropout rate is 0.2 in the original dataset and 0.4 for the favourable dataset, while the $\ell_1$ and $\ell_2$ regularisation are set to $10^{-5}$.
+We train the model using the \emph{Adam} optimiser with a starting learning rate of $10^{-3}$ and a mini-batch size of $32$.
+
+The architecture is similar in style to the old \emph{LeNet} network presented by Y.\ LeCun and collaborators in 1998 for handwritten digit recognition.
+In our implementation, however, we do not include the pooling operations and swap the usual order of batch normalisation and activation function, putting the ReLU activation first.
+
+In \Cref{fig:nn:lenet}, we show the model architecture in the case of the original dataset and of predicting \hodge{1}{1} alone.
+The convolutional layers have $180$, $100$, $40$ and $20$ filters, respectively.
+
+
+\begin{figure}[htp]
+    \centering
+    \includegraphics[width=0.75\linewidth]{img/ccnn}
+    \caption{%
+        Pure convolutional neural network for predicting \hodge{1}{1}.
+        It is made of $4$ modules composed of a convolutional layer, a ReLU activation and a batch normalisation layer (in this order), followed by a dropout layer, a flatten layer and the output layer.
+    }
+    \label{fig:nn:lenet}
+\end{figure}
+
+
+\paragraph{Results}
+
+With this setup, we were able to achieve an accuracy of \SI{94}{\percent} on both the development and the test sets for the ``old'' dataset and \SI{99}{\percent} for the favourable dataset on both validation and test sets (results are briefly summarised in \Cref{tab:res:ann}).
+We thus improved the results of the densely connected network and proved that convolutional networks can be valuable assets when dealing with the extraction of a good representation of the input data: not only are CNNs very good at recognising patterns and rotationally invariant objects inside pictures or general matrices of data, but deep architectures are also capable of transforming the input using non-linear transformations~\cite{Mallat:2016:UnderstandingDeepConvolutional} to create new patterns which can then be used for predictions.
+
+Even though the convolution operation is very time consuming, another advantage of CNNs is the extremely reduced number of parameters with respect to FC networks.\footnotemark{}
+\footnotetext{%
+    It took around 4 hours of training (and no optimisation) for each Hodge number in each dataset.
+}%
+The architectures we used were in fact made of approximately $\num{5.8e5}$ parameters: less than half the number of parameters used in the FC network.
+Ultimately, this leads to a smaller number of training epochs necessary to achieve good predictions (see \Cref{fig:cnn:class-ccnn}).
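+For concreteness, a minimal Keras sketch of this architecture (for the original dataset and \hodge{1}{1} alone) could be written as follows; the exact placement of the $\ell_1$/$\ell_2$ regularisation and the mean squared error loss are our assumptions:
+\begin{lstlisting}[language=Python]
+from tensorflow import keras
+from tensorflow.keras import layers, regularizers
+
+def build_conv_network(input_shape=(12, 15, 1)):
+    """Purely convolutional network: 4 Conv2D layers (180, 100, 40, 20 filters),
+    5x5 kernels, 'same' padding, ReLU before batch normalisation."""
+    reg = regularizers.l1_l2(l1=1e-5, l2=1e-5)
+    inputs = keras.Input(shape=input_shape)
+    x = inputs
+    for filters in (180, 100, 40, 20):
+        x = layers.Conv2D(filters, (5, 5), padding="same", kernel_regularizer=reg)(x)
+        x = layers.ReLU()(x)
+        x = layers.BatchNormalization(momentum=0.99)(x)
+    x = layers.Dropout(0.2)(x)   # 0.2 for the original dataset, 0.4 for the favourable one
+    x = layers.Flatten()(x)
+    outputs = layers.ReLU()(layers.Dense(1)(x))   # ReLU output forces non-negative predictions
+    model = keras.Model(inputs, outputs)
+    model.compile(optimizer=keras.optimizers.Adam(1e-3), loss="mse")  # loss assumed
+    return model
+\end{lstlisting}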
+
+
+\begin{figure}[htp]
+    \centering
+    \begin{subfigure}[c]{0.45\linewidth}
+        \centering
+        \includegraphics[width=\linewidth, trim={0 0 6in 0}, clip]{img/loss-lr_ccnn_h11_orig}
+        \caption{Loss function of \hodge{1}{1}.}
+    \end{subfigure}
+    \quad
+    \begin{subfigure}[c]{0.45\linewidth}
+        \centering
+        \includegraphics[width=\linewidth, trim={0 0 6in 0}, clip]{img/loss-lr_ccnn_h21_orig}
+        \caption{Loss function of \hodge{2}{1}.}
+    \end{subfigure}
+    \caption{
+        Loss functions of the networks for the prediction of \hodge{1}{1} and \hodge{2}{1}.
+        We can see that the validation loss flattens out while the training loss keeps decreasing: we took care of the overfit by using the weights of the network at the epoch where the validation loss reached its minimum.
+        The use of mini-batch gradient descent also spoils the monotonicity of the loss functions, which can therefore increase from one epoch to the next, while keeping a descending trend for most of their evolution.
+    }
+    \label{fig:cnn:class-ccnn}
+\end{figure}
+
+
+Using this classic setup, we tried different architectures.
+The network for the original dataset seems to work best in the presence of larger kernels, dropping by roughly \SI{5}{\percent} in accuracy when a more ``classical'' $3 \times 3$ kernel is used.
+We also tried setting the padding to \lstinline!valid!, reducing the input from a $12 \times 15$ matrix to a $1 \times 1$ feature map over the course of $5$ layers with $180$, $100$, $75$, $40$ and $20$ filters.
+The advantage is the reduction of the number of parameters (namely $\sim \num{4.9e5}$), mainly due to the small FC network at the end, but the accuracy dropped to \SI{87}{\percent}.
+The favourable dataset seems instead to be more independent of the specific architecture, retaining accuracy also with smaller kernels.
+
+The analysis for \hodge{2}{1} follows the same prescriptions.
+For both the original and favourable datasets, we opted for 4 convolutional layers with 250, 150, 100 and 50 filters and no FC network, for a total amount of $\num{2.1e6}$ parameters.
+
+In this scenario, we were able to achieve an accuracy of \SI{36}{\percent} on the development set and \SI{40}{\percent} on the test set for \hodge{2}{1} in the ``old'' dataset, and \SI{31}{\percent} on both development and test sets in the favourable one (see \Cref{tab:res:ann}).
+
+The learning curves for both Hodge numbers are given in \Cref{fig:lc:class-ccnn}.
+This model uses the same architecture as the one for predicting \hodge{1}{1} only, which explains why it is less accurate, as it also needs to adapt to compute \hodge{2}{1} -- a difficult task, as we have seen (see for example \Cref{fig:lc:inception}).
+
+
+\begin{figure}[htp]
+    \centering
+
+    \includegraphics[width=0.6\linewidth]{img/conv_nn_learning_curve}
+
+    \caption{%
+        Learning curves for the classic convolutional neural network (original dataset), using a single model for both Hodge numbers.
+    }
+    \label{fig:lc:class-ccnn}
+\end{figure}
+
+
+\subsubsection{Inception-like Neural Network}
+\label{sec:ml:nn:inception}
+
+
+In the effort to find a better architecture, we took inspiration from Google's winning CNN in the annual \href{https://image-net.org/challenges/LSVRC/}{\emph{ImageNet challenge}} in 2014~\cite{Szegedy:2015:GoingDeeperConvolutions, Szegedy:2016:RethinkingInceptionArchitecture, Szegedy:2016:Inceptionv4InceptionresnetImpact}.
+The architecture presented uses \emph{inception} modules in which separate $3 \times 3$ and $5 \times 5$ convolutions are performed side by side (together with \emph{max pooling} operations) before recombining the outputs.
+The modules are then repeated until the output layer is reached.
+This has two evident advantages: users can avoid taking a completely arbitrary decision on the type of convolution to use, since the network will take care of it by tuning the weights, and the number of parameters is extremely restricted, as the network can learn complicated functions using fewer layers.
+As a consequence, the architecture of such models can be made very deep while keeping the number of parameters contained, thus being able to learn very difficult representations of the input and producing accurate predictions.
+Moreover, while the training phase might become very long due to the complicated convolutional operations, the small number of parameters is such that predictions can be generated in a very small amount of time, making inception-like models extremely appropriate whenever quick predictions are necessary.
+Another advantage of the architecture is the presence of different kernel sizes inside each module: the network automatically learns features at different scales and different positions, thus combining the advantages of a deep architecture with the ability to learn different representations at the same time and compare them.
+
+
+\paragraph{Model}
+
+In \Cref{fig:nn:inception}, we show a schematic of our implementation.
+Differently from the image classification task, we drop the pooling operations and implement two side-by-side convolutions, one over rows ($12 \times 1$ kernel for the original dataset, $15 \times 1$ for the favourable one) and one over columns ($1 \times 15$ and $1 \times 18$, respectively).\footnotemark{}
+\footnotetext{%
+    Pooling operations are used to shrink the size of the input.
+    Similar to convolutions, they use a window of a given size to scan the input and select particular values inside.
+    For instance, we could select the average value inside the small portion selected, performing an \emph{average pooling} operation, or the maximum value, a \emph{max pooling} operation.
+    This usually improves image classification and object detection tasks as it can be used to sharpen edges and borders.
+}%
+We use \texttt{same} as padding option.
+The outputs of the convolutions are then concatenated along the filter dimension before repeating the ``inception'' module.
+The results from the last module are directly connected to the output layer through a flatten layer.
+In both datasets, we use batch normalisation layers (with momentum $0.99$) after each concatenation layer and a dropout layer (with rate $0.2$) before the FC network.\footnotemark{}
+\footnotetext{%
+    The position of the batch normalisation is extremely important, as the parameters computed by such a layer directly influence the following batch.
+    We however opted to wait for the scan over rows and columns to finish before normalising the outcome, to avoid biasing the resulting activation function.
+}%
+
+For both \hodge{1}{1} and \hodge{2}{1} (in both datasets), we used 3 modules made of 32, 64 and 32 filters for the first Hodge number, and 128, 128 and 64 filters for the second.
+We also included $\ell_1$ and $\ell_2$ regularisation of magnitude $10^{-4}$ in all cases.
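+A minimal Keras sketch of one such module and of the resulting network (here for the original dataset and \hodge{1}{1} alone, with the $\ell_1$/$\ell_2$ regularisation omitted for brevity) might read:
+\begin{lstlisting}[language=Python]
+from tensorflow import keras
+from tensorflow.keras import layers
+
+def inception_module(x, filters, kernels=((12, 1), (1, 15))):
+    """Parallel convolutions over rows and columns, concatenated along the
+    filter dimension and followed by batch normalisation."""
+    branches = [layers.Conv2D(filters, k, padding="same", activation="relu")(x)
+                for k in kernels]
+    x = layers.Concatenate()(branches)
+    return layers.BatchNormalization(momentum=0.99)(x)
+
+def build_inception_network(input_shape=(12, 15, 1), filters=(32, 64, 32)):
+    """Three modules (32, 64 and 32 filters for h11), dropout, flatten, ReLU output."""
+    inputs = keras.Input(shape=input_shape)
+    x = inputs
+    for f in filters:
+        x = inception_module(x, f)
+    x = layers.Dropout(0.2)(x)
+    x = layers.Flatten()(x)
+    outputs = layers.ReLU()(layers.Dense(1)(x))
+    return keras.Model(inputs, outputs)
+\end{lstlisting}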
+The number of parameters was thus restricted to $\num{2.3e5}$ for \hodge{1}{1} in the original dataset and $\num{2.9e5}$ in the favourable one, and to $\num{1.1e6}$ for \hodge{2}{1} in the original dataset and $\num{1.4e6}$ in the favourable one.
+In all cases, the number of parameters has decreased by a significant amount: in the case of \hodge{1}{1}, the network uses roughly $\frac{1}{3}$ of the parameters of the classical CNN and around $\frac{1}{6}$ of those of the FC network.
+
+For training we used the \emph{Adam} gradient descent with an initial learning rate of $10^{-3}$ and a batch size of $32$.
+The callbacks helped to contain the training time (without optimisation) under 5 hours for each Hodge number in each dataset.
+
+
+\begin{figure}[htp]
+    \centering
+    \includegraphics[width=0.9\linewidth]{img/icnn}
+    \caption{%
+        In each concatenation module (here shown for the ``old'' dataset) we operate with separate convolution operations over rows and columns, then concatenate the results. The overall architecture is composed of 3 ``inception'' modules made of two separate convolutions, a concatenation layer and a batch normalisation layer (strictly in this order), followed by a dropout layer, a flatten layer and the output layer with ReLU activation (in this order).}
+    \label{fig:nn:inception}
+\end{figure}
+
+
+\paragraph{Results}
+
+With these architectures, we were able to achieve more than \SI{99}{\percent} accuracy for \hodge{1}{1} on the test set (and the same on the development set) and \SI{50}{\percent} accuracy for \hodge{2}{1} (with a slightly smaller value on the development set).
+We report the results in \Cref{tab:res:ann}.
+
+We therefore increased the accuracy for both Hodge numbers (especially \hodge{2}{1}) compared to what a simple sequential network can achieve, while at the same time significantly reducing the number of parameters of the network.\footnotemark{}
+This increases the robustness of the method and its generalisation properties.
+\footnotetext{%
+    In an attempt to improve the results for \hodge{2}{1} even further, we also considered first predicting $\ln( 1 + \hodge{2}{1} )$ and then transforming it back. However, the predictions dropped by almost \SI{10}{\percent} in accuracy even when using the ``inception'' network: the network seems to approximate the results quite well (neither better nor worse than predicting \hodge{2}{1} directly), but the subsequent exponentiation drives the predictions and the true values apart.
+    Choosing a correct rounding strategy then becomes almost impossible.
+}
+
+In \Cref{fig:nn:inception_errors}, we show the distribution of the residuals and their scatter plot: the distribution of the errors does not present any pathological behaviour and the variance of the residuals is well distributed over the predictions.
+
+In fact, this neural network is much more powerful than the previous networks we considered, as can be seen by studying the learning curves (\Cref{fig:lc:inception}).
+When predicting only \hodge{1}{1}, it surpasses \SI{97}{\percent} accuracy using only \SI{30}{\percent} of the data for training.
+While it seems that the predictions suffer when using a single network for both Hodge numbers, this remains much better than any other algorithm.
+It may seem counter-intuitive that convolutions work well on this data, since the configuration matrices are not translation or rotation invariant, but only permutation invariant.
+However, convolution alone is not sufficient to ensure invariance under these transformations: it must be supplemented with pooling operations~\cite{Bengio:2017:DeepLearning}, which we do not use.
+Moreover, convolution layers do more than just taking translation properties into account: they make it possible to build highly complicated combinations of the inputs and to share weights among components, which allows finding subtler patterns than standard fully connected layers.
+This network is studied in more detail in~\cite{Erbin:2020:InceptionNeuralNetwork}.
+
+
+\begin{figure}[htp]
+    \centering
+    \begin{subfigure}[c]{0.45\linewidth}
+        \centering
+        \includegraphics[width=\linewidth, trim={0 0 6in 0}, clip]{img/loss-lr_icnn_h11_orig}
+        \caption{Loss of \hodge{1}{1}.}
+    \end{subfigure}
+    \quad
+    \begin{subfigure}[c]{0.45\linewidth}
+        \centering
+        \includegraphics[width=\linewidth, trim={0 0 6in 0}, clip]{img/loss-lr_icnn_h21_orig}
+        \caption{Loss of \hodge{2}{1}.}
+    \end{subfigure}
+    \caption{The loss functions of the ``inception'' network for \hodge{1}{1} and \hodge{2}{1} in the original dataset show that the number of epochs required for training is definitely larger than for simpler architectures, despite the reduced number of parameters.}
+    \label{fig:cnn:inception-loss}
+\end{figure}
+
+
+\begin{figure}[htp]
+    \centering
+    \begin{subfigure}[c]{\linewidth}
+        \centering
+        \includegraphics[width=0.8\linewidth]{img/errors_icnn_h11_orig}
+        \caption{Residuals of \hodge{1}{1}.}
+    \end{subfigure}
+    \quad
+    \begin{subfigure}[c]{\linewidth}
+        \centering
+        \includegraphics[width=0.8\linewidth]{img/errors_icnn_h21_orig}
+        \caption{Residuals of \hodge{2}{1}.}
+    \end{subfigure}
+    \caption{Histograms of the residual errors and residual plots of the Inception network.}
+    \label{fig:nn:inception_errors}
+\end{figure}
+
+
+\begin{figure}[htp]
+    \centering
+
+    \begin{subfigure}[c]{0.45\linewidth}
+        \centering
+        \includegraphics[width=\linewidth]{img/inc_nn_learning_curve}
+        \caption{predicting both \hodge{1}{1} and \hodge{2}{1}}
+    \end{subfigure}
+    \qquad
+    \begin{subfigure}[c]{0.45\linewidth}
+        \centering
+        \includegraphics[width=\linewidth]{img/inc_nn_learning_curve_h11}
+        \caption{predicting \hodge{1}{1} only}
+    \end{subfigure}
+
+    \caption{Learning curves for the Inception neural network (original dataset).}
+    \label{fig:lc:inception}
+\end{figure}
+
+
+\begin{table}[htb]
+\centering
+    \begin{tabular}{@{}ccccccc@{}}
+        \toprule
+        & \multicolumn{2}{c}{\textbf{DenseNet}}
+        & \multicolumn{2}{c}{\textbf{classic ConvNet}}
+        & \multicolumn{2}{c}{\textbf{inception ConvNet}}
+        \\
+        & \emph{old} & \emph{fav.}
+        & \emph{old} & \emph{fav.}
+        & \emph{old} & \emph{fav.}
+        \\
+        \midrule
+        \hodge{1}{1}
+        & \SI{77}{\percent} & \SI{97}{\percent}
+        & \SI{94}{\percent} & \SI{99}{\percent}
+        & \SI{99}{\percent} & \SI{99}{\percent}
+        \\
+        \hodge{2}{1}
+        & - & -
+        & \SI{36}{\percent} & \SI{31}{\percent}
+        & \SI{50}{\percent} & \SI{48}{\percent}
+        \\
+        \bottomrule
+\end{tabular}
+\caption{Accuracy of the ANN predictions for \hodge{1}{1} and \hodge{2}{1} on the test set, using \emph{rint} rounding.}
+\label{tab:res:ann}
+\end{table}
+
+
+\subsubsection{Boosting the Inception-like Model}
+
+
+To further improve the accuracy of \hodge{2}{1}, we have tried to modify the network by adding engineered features as auxiliary inputs.
+This can be done by adding inputs to the inception neural network and merging the different branches at different stages, as sketched below.
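+Schematically (and with purely hypothetical layer sizes), such a merged architecture could be assembled with Keras' functional API as follows, here freezing a previously trained Inception branch:
+\begin{lstlisting}[language=Python]
+from tensorflow import keras
+from tensorflow.keras import layers
+
+def add_auxiliary_inputs(inception_model, n_aux_features):
+    """Merge engineered features with a (frozen) trained Inception branch."""
+    inception_model.trainable = False   # transfer learning: freeze the trained branch
+    aux_inputs = keras.Input(shape=(n_aux_features,))
+    aux = layers.Dense(64, activation="relu")(aux_inputs)       # hypothetical size
+    merged = layers.Concatenate()([inception_model.output, aux])
+    merged = layers.Dense(32, activation="relu")(merged)        # hypothetical size
+    h11 = layers.Dense(1, activation="relu", name="h11")(merged)
+    h21 = layers.Dense(1, activation="relu", name="h21")(merged)
+    return keras.Model([inception_model.input, aux_inputs], [h11, h21])
+\end{lstlisting}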
+There are two possibilities to train such a network: 1) train the whole network directly, or 2) train the Inception network alone, then freeze its weights and connect it to the additional inputs, training only the new layers.
+We found that the architectures we tried did not improve the accuracy, but we briefly describe our attempts for completeness.
+
+We focused in particular on the number of projective spaces, the vector of dimensions of the projective spaces and the vector of dimensions of the principal cohomology group, and on predicting \hodge{1}{1} and \hodge{2}{1} at the same time.
+The core of the neural network is the Inception network described in \Cref{sec:ml:nn:inception}.
+Then, the engineered features are processed using fully connected layers and merged with the predictions from the Inception branch using a concatenation layer.
+Obviously, the output layers for \hodge{1}{1} and \hodge{2}{1} can be located on different branches, which allows for a different processing of the features.
+
+As mentioned earlier, a possible approach is to first train the Inception branch alone, before freezing its weights and connecting it to the rest of the network.
+This can prevent spoiling the already good predictions and speed up the new learning process.
+This is a common technique called \emph{transfer learning}: we can use a model previously trained on a slightly different task and use its weights as part of the new architecture.
+
+Our trials involved shallow fully connected layers ($1$--$3$ layers with $10$ to $150$ units) acting on the engineered features and after the concatenation layer.
+Since the \eda analysis (\Cref{sec:data:eda}) shows a correlation between both Hodge numbers, we tried architectures where the result for \hodge{1}{1} is used to predict \hodge{2}{1}.
+
+For the training phase, we also tried an alternative to the canonical choice of optimising the sum of the losses.
+We first train the network and stop the process when the validation loss for \hodge{1}{1} no longer improves, load back the best weights and save the results, then keep training and stop when the loss for \hodge{2}{1} reaches a plateau.
+
+
+
+
+With this setup we were able to slightly improve the predictions of \hodge{1}{1} in the original dataset, reaching almost \SI{100}{\percent} accuracy, while the favourable dataset stayed at around \SI{99}{\percent} accuracy.
+The few missed predictions (4 manifolds out of 786 in the test set) lie in very peculiar regions of the distribution of the Hodge number.
+For \hodge{2}{1}, no improvement was observed.
+
+
+
+\subsection{Ensemble Learning: Stacking}
+
+
+We conclude the \ml analysis by describing a method very popular in \ml competitions: ensembling.
+This consists in taking several \ml algorithms and combining the predictions of each individual model to obtain more precise predictions.
+Using this technique, it is possible to decrease the variance and improve generalisation by compensating the weaknesses of some algorithms with the strengths of others.
+Indeed, the idea is to put together algorithms which perform best in different regions of the label distribution and to combine them into an algorithm better than any individual component.
+
+The simplest such algorithm is \emph{stacking}, whose principle is summarised in \Cref{fig:stack:def}.
+First, the original training set is split into two parts (not necessarily of equal size).
+Second, a certain number of \emph{first-level learners} are trained on the first split and used to generate predictions on the second split.
+Third, a ``meta-learner'' is trained on the second split to combine the predictions from the first-level learners.
+Predictions for the test set are obtained by applying both levels of models one after the other.
+
+We have selected the following models for the first level: the linear regression (elastic net), the \svm with the Gaussian kernel, the random forest and the ``inception'' neural network.
+The meta-learner is a simple linear regression with $\ell_1$ regularisation (Lasso).
+The motivation for this choice of first-level algorithms is that stacking works best with a group of algorithms which work in the most diverse ways.
+
+Also in this case, we use a cross-validation strategy with 5 splits for each level of the training (from the \SI{90}{\percent} of the samples forming the total training set, we take two halves containing \SI{45}{\percent} of the total samples each and then use 5 splits to grade the algorithms, thus using \SI{9}{\percent} of each split for cross-validation at each iteration), and the Bayes optimisation for all algorithms but the ANN (50 iterations for the elastic net, the \svm and the lasso, and 25 for the random forests).
+The ANN was trained using a holdout validation set containing the same number of samples as each cross-validation fold, namely \SI{9}{\percent} of the total set.
+The accuracy is then computed as usual, using \texttt{numpy.rint} for the \svm, the neural networks, the meta-learner and, in general, for \hodge{1}{1} in the original dataset, and \texttt{numpy.floor} in the other cases.
+
+In \Cref{tab:res:stack}, we show the accuracy of the ensemble learning.
+We notice that the accuracy improves slightly only for \hodge{2}{1} (original dataset) compared to the first-level learners.
+However, this is much lower than what was achieved in \Cref{sec:ml:nn:inception}.
+One reason is that the learning suffers from the reduced size of the training set; another is that the different algorithms may perform similarly well in the same regions.
+
+
+\begin{figure}[htp]
+    \centering
+    \includegraphics[width=0.65\linewidth]{img/stacking}
+    \caption{Stacking ensemble learning with two levels of learning.
+    The original training set is split into two training folds and the first-level learners are trained on the first.
+    The trained models are then used to generate a new training set (here the ``1st level labels'') using the second split as input features.
+    The same also applies to the test set.
+ Finally a ``meta-learner'' uses the newly generated training set to produce the final predictions on the test set.} + \label{fig:stack:def} +\end{figure} + + +\begin{table}[htb] +\centering +\begin{tabular}{@{}cccccc@{}} + \toprule + & + & \multicolumn{2}{c}{\hodge{1}{1}} + & \multicolumn{2}{c}{\hodge{2}{1}} + \\ + & + & \emph{old} & \emph{fav.} + & \emph{old} & \emph{fav.} + \\ + \midrule + \multirow{4}{*}{\emph{1st level}} + & EN + & \SI{65}{\percent} & \SI{100}{\percent} + & \SI{19}{\percent} & \SI{19}{\percent} + \\ + & \svm + & \SI{70}{\percent} & \SI{100}{\percent} + & \SI{30}{\percent} & \SI{34}{\percent} + \\ + & RF + & \SI{61}{\percent} & \SI{98}{\percent} + & \SI{18}{\percent} & \SI{24}{\percent} + \\ + & ANN + & \SI{98}{\percent} & \SI{98}{\percent} + & \SI{33}{\percent} & \SI{30}{\percent} + \\ + \midrule + \multirow{1}{*}{\emph{2nd level}} + & Lasso + & \SI{98}{\percent} & \SI{98}{\percent} + & \SI{36}{\percent} & \SI{33}{\percent} + \\ + \bottomrule +\end{tabular} +\caption{Accuracy of the first and second level predictions of the stacking ensemble for elastic net regression (EN), support vector with \texttt{rbf} kernel (SVR), random forest (RF) and the artificial neural network (ANN) as first level learners and lasso regression as meta learner.} +\label{tab:res:stack} +\end{table} + +% vim: ft=tex diff --git a/thesis.bib b/thesis.bib index efc9f83..50f4613 100644 --- a/thesis.bib +++ b/thesis.bib @@ -386,6 +386,17 @@ number = {2} } +@article{Baxter:2000:ModelInductiveBias, + title = {A Model of Inductive Bias Learning}, + author = {Baxter, Jonathan}, + date = {2000}, + journaltitle = {Journal of artificial intelligence research}, + volume = {12}, + pages = {149--198}, + doi = {10.1613/jair.731}, + file = {/home/riccardo/.local/share/zotero/files/baxter_2000_a_model_of_inductive_bias_learning3.pdf} +} + @book{Becker:2006:StringTheoryMTheory, title = {String {{Theory}} and {{M}}-{{Theory}}: {{A Modern Introduction}}}, author = {Becker, Katrin and Becker, Melanie and Schwarz, John H.}, @@ -882,6 +893,37 @@ number = {9} } +@article{Carleo:2019:MachineLearningPhysical, + title = {Machine Learning and the Physical Sciences}, + author = {Carleo, Giuseppe and Cirac, Ignacio and Cranmer, Kyle and Daudet, Laurent and Schuld, Maria and Tishby, Naftali and Vogt-Maranto, Leslie and Zdeborová, Lenka}, + date = {2019}, + journaltitle = {Reviews of Modern Physics}, + shortjournal = {Rev. Mod. Phys.}, + volume = {91}, + pages = {045002}, + issn = {0034-6861, 1539-0756}, + doi = {10.1103/RevModPhys.91.045002}, + abstract = {Machine learning encompasses a broad range of algorithms and modeling tools used for a vast array of data processing tasks, which has entered most scientific disciplines in recent years. We review in a selective way the recent research on the interface between machine learning and physical sciences. This includes conceptual developments in machine learning (ML) motivated by physical insights, applications of machine learning techniques to several domains in physics, and cross-fertilization between the two fields. After giving basic notion of machine learning methods and principles, we describe examples of how statistical physics is used to understand methods in ML. We then move to describe applications of ML methods in particle physics and cosmology, quantum many body physics, quantum computing, and chemical and material physics. We also highlight research and development into novel computing architectures aimed at accelerating ML. 
In each of the sections we describe recent successes as well as domain-specific methodology and challenges.}, + archivePrefix = {arXiv}, + eprint = {1903.10563}, + eprinttype = {arxiv}, + file = {/home/riccardo/.local/share/zotero/files/carleo_et_al_2019_machine_learning_and_the_physical_sciences2.pdf;/home/riccardo/.local/share/zotero/storage/IVAHE4BQ/1903.html}, + number = {4} +} + +@article{Caruana:1997:MultitaskLearning, + title = {Multitask Learning}, + author = {Caruana, Rich}, + date = {1997}, + journaltitle = {Machine learning}, + volume = {28}, + pages = {41--75}, + publisher = {{Springer}}, + doi = {10.1023/A:1007379606734}, + file = {/home/riccardo/.local/share/zotero/files/caruana_1997_multitask_learning3.pdf}, + number = {1} +} + @inproceedings{Caruana:2006:EmpiricalComparisonSupervised, title = {An Empirical Comparison of Supervised Learning Algorithms}, booktitle = {Proceedings of the 23rd International Conference on {{Machine}} Learning}, @@ -1025,7 +1067,7 @@ @article{Comsa:2019:SupergravityMagicMachine, title = {{{SO}}(8) Supergravity and the Magic of Machine Learning}, author = {Comsa, Iulia M. and Firsching, Moritz and Fischbacher, Thomas}, - date = {2019-08}, + date = {2019}, journaltitle = {Journal of High Energy Physics}, shortjournal = {J. High Energ. Phys.}, volume = {2019}, @@ -1043,7 +1085,7 @@ @article{Constantin:2019:CountingStringTheory, title = {Counting String Theory Standard Models}, author = {Constantin, Andrei and He, Yang-Hui and Lukas, Andre}, - date = {2019-05}, + date = {2019}, journaltitle = {Physics Letters B}, shortjournal = {Physics Letters B}, volume = {792}, @@ -1061,7 +1103,7 @@ @article{Cornalba:2002:NewCosmologicalScenario, title = {A {{New Cosmological Scenario}} in {{String Theory}}}, author = {Cornalba, Lorenzo and Costa, Miguel S.}, - date = {2002-09-03}, + date = {2002}, journaltitle = {Physical Review D}, shortjournal = {Phys. Rev. D}, volume = {66}, @@ -1080,7 +1122,7 @@ ids = {Cornalba:2004:TimeDependentOrbifolds}, title = {Time-Dependent Orbifolds and String Cosmology}, author = {Cornalba, Lorenzo and Costa, Miguel S.}, - date = {2004-02}, + date = {2004}, journaltitle = {Fortschritte der Physik}, volume = {52}, pages = {145--199}, @@ -1112,7 +1154,7 @@ ids = {Craps:2002:StringPropagationPresencea}, title = {String {{Propagation}} in the {{Presence}} of {{Cosmological Singularities}}}, author = {Craps, Ben and Kutasov, David and Rajesh, Govindan}, - date = {2002-06-26}, + date = {2002}, journaltitle = {Journal of High Energy Physics}, shortjournal = {J. High Energy Phys.}, volume = {2002}, @@ -1132,7 +1174,7 @@ ids = {Craps:2006:BigBangModelsa}, title = {Big {{Bang Models}} in {{String Theory}}}, author = {Craps, Ben}, - date = {2006-11-07}, + date = {2006}, journaltitle = {Classical and Quantum Gravity}, shortjournal = {Class. Quantum Grav.}, volume = {23}, @@ -1150,7 +1192,7 @@ @article{Cremades:2003:YukawaCouplingsIntersecting, title = {Yukawa Couplings in Intersecting {{D}}-Brane Models}, author = {Cremades, Daniel and Ibanez, Luis E. and Marchesano, Fernando}, - date = {2003-07-16}, + date = {2003}, journaltitle = {Journal of High Energy Physics}, shortjournal = {J. High Energy Phys.}, volume = {2003}, @@ -1168,7 +1210,7 @@ @article{Cvetic:2010:BranesInstantonsIntersecting, title = {Branes and Instantons Intersecting at Angles}, author = {Cvetič, Mirjam and García-Etxebarria, Iñaki and Richter, Robert}, - date = {2010-01}, + date = {2010}, journaltitle = {Journal of High Energy Physics}, shortjournal = {J. High Energ. 
Phys.}, volume = {2010}, @@ -1186,7 +1228,7 @@ @article{DAppollonio:2003:StringInteractionsGravitational, title = {String Interactions in Gravitational Wave Backgrounds}, author = {D'Appollonio, Giuseppe and Kiritsis, Elias}, - date = {2003-12}, + date = {2003}, journaltitle = {Nuclear Physics B}, shortjournal = {Nuclear Physics B}, volume = {674}, @@ -1204,7 +1246,7 @@ @article{DAppollonio:2005:DbranesBCFTHppwave, title = {D-Branes and {{BCFT}} in {{Hpp}}-Wave Backgrounds}, author = {D'Appollonio, Giuseppe and Kiritsis, Elias}, - date = {2005-04}, + date = {2005}, journaltitle = {Nuclear Physics B}, shortjournal = {Nuclear Physics B}, volume = {712}, @@ -1222,7 +1264,7 @@ @article{David:2000:TachyonCondensationD0, title = {Tachyon Condensation in the {{D0}}/{{D4}} System}, author = {David, Justin R.}, - date = {2000-10-03}, + date = {2000}, journaltitle = {Journal of High Energy Physics}, shortjournal = {J. High Energy Phys.}, volume = {2000}, @@ -1241,7 +1283,7 @@ @article{David:2001:TachyonCondensationUsing, title = {Tachyon Condensation Using the Disc Partition Function}, author = {David, Justin R.}, - date = {2001-07-10}, + date = {2001}, journaltitle = {Journal of High Energy Physics}, shortjournal = {J. High Energy Phys.}, volume = {2001}, @@ -1259,7 +1301,7 @@ @article{David:2002:ClosedStringTachyon, title = {Closed {{String Tachyon Condensation}} on {{Twisted Circles}}}, author = {David, Justin R. and Gutperle, Michael and Headrick, Matthew and Minwalla, Shiraz}, - date = {2002-02-26}, + date = {2002}, journaltitle = {Journal of High Energy Physics}, shortjournal = {J. High Energy Phys.}, volume = {2002}, @@ -1277,7 +1319,7 @@ @article{DellaSelva:1970:SimpleExpressionSciuto, title = {A Simple Expression for the {{Sciuto}} Three-Reggeon Vertex-Generating Duality}, author = {Della Selva, Angelo and Saito, Satoru}, - date = {1970-10}, + date = {1970}, journaltitle = {Lettere al Nuovo Cimento}, shortjournal = {Lett. Nuovo Cimento}, volume = {4}, @@ -1293,7 +1335,7 @@ title = {Computational Complexity of the Landscape: {{Part I}}}, shorttitle = {Computational Complexity of the Landscape}, author = {Denef, Frederik and Douglas, Michael R.}, - date = {2007-05}, + date = {2007}, journaltitle = {Annals of Physics}, shortjournal = {Annals of Physics}, volume = {322}, @@ -1312,7 +1354,7 @@ @article{DiBartolomeo:1990:GeneralPropertiesVertices, title = {General Properties of Vertices with Two {{Ramond}} or Twisted States}, author = {Di Bartolomeo, Nicola and Di Vecchia, Paolo and Guatieri, R.}, - date = {1990-12}, + date = {1990}, journaltitle = {Nuclear Physics B}, shortjournal = {Nuclear Physics B}, volume = {347}, @@ -1340,7 +1382,7 @@ @article{Dijkstra:2005:ChiralSupersymmetricStandard, title = {Chiral Supersymmetric {{Standard Model}} Spectra from Orientifolds of {{Gepner}} Models}, author = {Dijkstra, T. P. T. and Huiszoon, Lennaert R. and Schellekens, A. N.}, - date = {2005-03}, + date = {2005}, journaltitle = {Physics Letters B}, shortjournal = {Physics Letters B}, volume = {609}, @@ -1359,7 +1401,7 @@ @article{Dijkstra:2005:SupersymmetricStandardModel, title = {Supersymmetric Standard Model Spectra from {{RCFT}} Orientifolds}, author = {Dijkstra, T. P. T. and Huiszoon, Lennaert R. and Schellekens, A. 
N.}, - date = {2005-03}, + date = {2005}, journaltitle = {Nuclear Physics B}, shortjournal = {Nuclear Physics B}, volume = {710}, @@ -1378,7 +1420,7 @@ @article{DiVecchia:1990:VertexIncludingEmission, title = {A Vertex Including Emission of Spin Fields}, author = {Di Vecchia, Paolo and Madsen, R. and Hornfeck, Klaus and Roland, Kaj}, - date = {1990-01}, + date = {1990}, journaltitle = {Physics Letters B}, shortjournal = {Physics Letters B}, volume = {235}, @@ -1393,7 +1435,7 @@ @article{DiVecchia:1997:ClassicalPbranesBoundary, title = {Classical P-Branes from Boundary State}, author = {Di Vecchia, Paolo and Frau, Marialuisa and Pesando, Igor and Sciuto, Stefano and Lerda, Alberto and Russo, Rodolfo}, - date = {1997-07}, + date = {1997}, journaltitle = {Nuclear Physics B}, volume = {507}, pages = {259--276}, @@ -1411,7 +1453,7 @@ title = {D-Branes in String Theory {{II}}}, booktitle = {{{YITP}} Workshop on Developments in Superstring and {{M}} Theory}, author = {Di Vecchia, Paolo and Liccardo, Antonella}, - date = {1999-12}, + date = {1999}, pages = {7--48}, archivePrefix = {arXiv}, eprint = {hep-th/9912275}, @@ -1439,7 +1481,7 @@ title = {Boundary {{State}} for {{Magnetized D9 Branes}} and {{One}}-{{Loop Calculation}}}, booktitle = {Sense of {{Beauty}} in {{Physics}}: {{Miniconference}} in {{Honor}} of {{Adriano Di Giacomo}} on His 70th {{Birthday}}}, author = {Di Vecchia, Paolo and Liccardo, Antonella and Marotta, Raffaele and Pezzella, Franco and Pesando, Igor}, - date = {2006-01}, + date = {2006}, abstract = {We construct the boundary state describing magnetized D9 branes in R\^\{3,1\} x T\^6 and we use it to compute the annulus and Moebius amplitudes. We derive from them, by using open/closed string duality, the number of Landau levels on the torus T\^d.}, archivePrefix = {arXiv}, eprint = {hep-th/0601067}, @@ -1451,7 +1493,7 @@ ids = {DiVecchia:2007:WrappedMagnetizedBranesa}, title = {Wrapped Magnetized Branes: Two Alternative Descriptions?}, author = {Di Vecchia, Paolo and Liccardo, Antonella and Marotta, Raffaele and Pezzella, Franco and Pesando, Igor}, - date = {2007-12}, + date = {2007}, journaltitle = {Journal of High Energy Physics}, volume = {2007}, pages = {100--100}, @@ -1471,7 +1513,7 @@ @article{DiVecchia:2011:OpenStringsSystem, title = {Open Strings in the System {{D5}}/{{D9}}}, author = {Di Vecchia, Paolo and Marotta, Raffaele and Pesando, Igor and Pezzella, Franco}, - date = {2011-06-17}, + date = {2011}, journaltitle = {Journal of Physics A: Mathematical and Theoretical}, shortjournal = {J. Phys. A: Math. Theor.}, volume = {44}, @@ -1489,7 +1531,7 @@ @article{Dixon:1985:StringsOrbifolds, title = {Strings on Orbifolds}, author = {Dixon, Lance J. and Harvey, Jeffrey A. and Vafa, Cumrun and Witten, Edward}, - date = {1985-01}, + date = {1985}, journaltitle = {Nuclear Physics B}, shortjournal = {Nuclear Physics B}, volume = {261}, @@ -1504,7 +1546,7 @@ @article{Dixon:1986:StringsOrbifoldsII, title = {Strings on Orbifolds ({{II}})}, author = {Dixon, Lance J. and Harvey, Jeffrey A. and Vafa, Cumrun and Witten, Edward}, - date = {1986-09}, + date = {1986}, journaltitle = {Nuclear Physics B}, shortjournal = {Nuclear Physics B}, volume = {274}, @@ -1604,7 +1646,7 @@ @article{Duo:2007:NewTwistField, title = {New Twist Field Couplings from the Partition Function for Multiply Wrapped {{D}}-Branes}, author = {Duo, Dario and Russo, Rodolfo and Sciuto, Stefano}, - date = {2007-12-12}, + date = {2007}, journaltitle = {Journal of High Energy Physics}, shortjournal = {J. 
High Energy Phys.}, volume = {2007}, @@ -1622,7 +1664,7 @@ @article{Engberg:1993:AlgorithmComputingFourRamond, title = {An Algorithm for Computing Four-{{Ramond}} Vertices at Arbitrary Level}, author = {Engberg, Niclas and Nilsson, Bengt E.W. and Sundell, Per}, - date = {1993-08}, + date = {1993}, journaltitle = {Nuclear Physics B}, shortjournal = {Nuclear Physics B}, volume = {404}, @@ -1638,7 +1680,7 @@ @online{Erbin:2018:GANsGeneratingEFT, title = {{{GANs}} for Generating {{EFT}} Models}, author = {Erbin, Harold and Krippendorf, Sven}, - date = {2018-09-06}, + date = {2018}, abstract = {We initiate a way of generating models by the computer, satisfying both experimental and theoretical constraints. In particular, we present a framework which allows the generation of effective field theories. We use Generative Adversarial Networks to generate these models and we generate examples which go beyond the examples known to the machine. As a starting point, we apply this idea to the generation of supersymmetric field theories. In this case, the machine knows consistent examples of supersymmetric field theories with a single field and generates new examples of such theories. In the generated potentials we find distinct properties, here the number of minima in the scalar potential, with values not found in the training data. We comment on potential further applications of this framework.}, archivePrefix = {arXiv}, eprint = {1809.02612}, @@ -1651,7 +1693,7 @@ ids = {Erbin:2020:InceptionNeuralNetworka}, title = {Inception {{Neural Network}} for {{Complete Intersection Calabi}}-{{Yau}} 3-Folds}, author = {Erbin, Harold and Finotello, Riccardo}, - date = {2020-07-27}, + date = {2020}, url = {http://arxiv.org/abs/2007.13379}, urldate = {2020-08-06}, abstract = {We introduce a neural network inspired by Google's Inception model to compute the Hodge number \$h\^\{1,1\}\$ of complete intersection Calabi-Yau (CICY) 3-folds. This architecture improves largely the accuracy of the predictions over existing results, giving already 97\% of accuracy with just 30\% of the data for training. Moreover, accuracy climbs to 99\% when using 80\% of the data for training. This proves that neural networks are a valuable resource to study geometric aspects in both pure mathematics and string theory.}, @@ -1667,7 +1709,7 @@ title = {Machine Learning for Complete Intersection {{Calabi}}-{{Yau}} Manifolds: A Methodological Study}, shorttitle = {Machine Learning for Complete Intersection {{Calabi}}-{{Yau}} Manifolds}, author = {Erbin, Harold and Finotello, Riccardo}, - date = {2020-07-30}, + date = {2020}, url = {http://arxiv.org/abs/2007.15706}, urldate = {2020-08-06}, abstract = {We revisit the question of predicting both Hodge numbers \$h\^\{1,1\}\$ and \$h\^\{2,1\}\$ of complete intersection Calabi-Yau (CICY) 3-folds using machine learning (ML), considering both the old and new datasets built respectively by Candelas-Dale-Lutken-Schimmrigk / Green-H\textbackslash "ubsch-Lutken and by Anderson-Gao-Gray-Lee. In real world applications, implementing a ML system rarely reduces to feed the brute data to the algorithm. Instead, the typical workflow starts with an exploratory data analysis (EDA) which aims at understanding better the input data and finding an optimal representation. It is followed by the design of a validation procedure and a baseline model. Finally, several ML models are compared and combined, often involving neural networks with a topology more complicated than the sequential models typically used in physics. 
By following this procedure, we improve the accuracy of ML computations for Hodge numbers with respect to the existing literature. First, we obtain 97\% (resp. 99\%) accuracy for \$h\^\{1,1\}\$ using a neural network inspired by the Inception model for the old dataset, using only 30\% (resp. 70\%) of the data for training. For the new one, a simple linear regression leads to almost 100\% accuracy with 30\% of the data for training. The computation of \$h\^\{2,1\}\$ is less successful as we manage to reach only 50\% accuracy for both datasets, but this is still better than the 16\% obtained with a simple neural network (SVM with Gaussian kernel and feature engineering and sequential convolutional network reach at best 36\%). This serves as a proof of concept that neural networks can be valuable to study the properties of geometries appearing in string theory.}, @@ -1682,7 +1724,7 @@ @article{Erler:1993:HigherTwistedSector, title = {Higher {{Twisted Sector Couplings}} of {{Z}}{{{\textsubscript{N}}}} {{Orbifolds}}}, author = {Erler, Jens and Jungnickel, Dirk-U. and Spaliński, Michał and Stieberger, Stephan}, - date = {1993-05}, + date = {1993}, journaltitle = {Nuclear Physics B}, shortjournal = {Nuclear Physics B}, volume = {397}, @@ -1717,7 +1759,7 @@ @article{Faraggi:2020:MachineLearningClassification, title = {Towards Machine Learning in the Classification of {{Z}}{\textsubscript{2}} × {{Z}}{\textsubscript{2}} Orbifold Compactifications}, author = {Faraggi, A E and Harries, G and Percival, B and Rizos, J}, - date = {2020-08}, + date = {2020}, journaltitle = {Journal of Physics: Conference Series}, shortjournal = {J. Phys.: Conf. Ser.}, volume = {1586}, @@ -1761,7 +1803,7 @@ @article{Figueroa-OFarrill:2001:GeneralisedSupersymmetricFluxbranes, title = {Generalised Supersymmetric Fluxbranes}, author = {Figueroa-O'Farrill, José and Simón, Joan}, - date = {2001-12-10}, + date = {2001}, journaltitle = {Journal of High Energy Physics}, shortjournal = {J. High Energy Phys.}, volume = {2001}, @@ -1779,7 +1821,7 @@ @online{Finotello:2019:2DFermionStrip, title = {{{2D Fermion}} on the {{Strip}} with {{Boundary Defects}} as a {{CFT}} with {{Excited Spin Fields}}}, author = {Finotello, Riccardo and Pesando, Igor}, - date = {2019-12-16}, + date = {2019}, url = {http://arxiv.org/abs/1912.07617}, urldate = {2020-02-27}, abstract = {We consider a two-dimensional fermion on the strip in the presence of an arbitrary number of zero-dimensional boundary changing defects. We show that the theory is still conformal with time dependent stress-energy tensor and that the allowed defects can be understood as excited spin fields. Finally we compute correlation functions involving these excited spin fields without using bosonization.}, @@ -1795,7 +1837,7 @@ ids = {Finotello:2019:ClassicalSolutionBosonica}, title = {The {{Classical Solution}} for the {{Bosonic String}} in the {{Presence}} of {{Three D}}-Branes {{Rotated}} by {{Arbitrary SO}}(4) {{Elements}}}, author = {Finotello, Riccardo and Pesando, Igor}, - date = {2019-04}, + date = {2019}, journaltitle = {Nuclear Physics B}, shortjournal = {Nuclear Physics B}, volume = {941}, @@ -1813,7 +1855,7 @@ @article{Forste:2018:YukawaCouplingsMagnetized, title = {Yukawa Couplings from Magnetized {{D}}-Brane Models on Non-Factorisable Tori}, author = {Forste, Stefan and Liyanage, Christoph}, - date = {2018-08}, + date = {2018}, journaltitle = {Journal of High Energy Physics}, shortjournal = {J. High Energ. 
Phys.}, volume = {2018}, @@ -1831,7 +1873,7 @@ @article{Frampton:2001:ClassificationConformalityModels, title = {Classification of {{Conformality Models Based}} on {{Nonabelian Orbifolds}}}, author = {Frampton, Paul H. and Kephart, Thomas W.}, - date = {2001-09-27}, + date = {2001}, journaltitle = {Physical Review D}, shortjournal = {Phys. Rev. D}, volume = {64}, @@ -1849,7 +1891,7 @@ @article{Friedan:1986:ConformalInvarianceSupersymmetry, title = {Conformal Invariance, Supersymmetry and String Theory}, author = {Friedan, Daniel and Martinec, Emil and Shenker, Stephen}, - date = {1986-01}, + date = {1986}, journaltitle = {Nuclear Physics B}, volume = {271}, pages = {93--165}, @@ -1896,7 +1938,7 @@ @article{Gan:2017:HolographyDeepLearning, title = {Holography as Deep Learning}, author = {Gan, Wen-Cong and Shu, Fu-Wen}, - date = {2017-10}, + date = {2017}, journaltitle = {International Journal of Modern Physics D}, shortjournal = {Int. J. Mod. Phys. D}, volume = {26}, @@ -1916,7 +1958,7 @@ @article{Gato:1990:VertexOperatorsNonabelian, title = {Vertex Operators, Non-Abelian Orbifolds and the {{Reimann}}-{{Hilbert}} Problem}, author = {Gato, Beatriz}, - date = {1990-04}, + date = {1990}, journaltitle = {Nuclear Physics B}, shortjournal = {Nuclear Physics B}, volume = {334}, @@ -1933,7 +1975,7 @@ @article{Gava:1997:BoundStatesBranes, title = {On the {{Bound States}} of P- and (P+2)-{{Branes}}}, author = {Gava, Edi and Narain, Kumar S. and Sarmadi, Hossein M.}, - date = {1997-10}, + date = {1997}, journaltitle = {Nuclear Physics B}, shortjournal = {Nuclear Physics B}, volume = {504}, @@ -1964,7 +2006,7 @@ @online{Ginsparg:1988:AppliedConformalField, title = {Applied {{Conformal Field Theory}}}, author = {Ginsparg, Paul}, - date = {1988-11}, + date = {1988}, abstract = {These lectures consisted of an elementary introduction to conformal field theory, with some applications to statistical mechanical systems, and fewer to string theory. Contents: 1. Conformal theories in d dimensions 2. Conformal theories in 2 dimensions 3. The central charge and the Virasoro algebra 4. Kac determinant and unitarity 5. Identication of m = 3 with the critical Ising model 6. Free bosons and fermions 7. Free fermions on a torus 8. Free bosons on a torus 9. Affine Kac-Moody algebras and coset constructions 10. Advanced applications}, archivePrefix = {arXiv}, eprint = {hep-th/9108028}, @@ -1987,7 +2029,7 @@ title = {One in a Billion: {{MSSM}}-like {{D}}-Brane Statistics}, shorttitle = {One in a Billion}, author = {Gmeiner, Florian and Blumenhagen, Ralph and Honecker, Gabriele and Lüst, Dieter and Weigand, Timo}, - date = {2006-01-04}, + date = {2006}, journaltitle = {Journal of High Energy Physics}, shortjournal = {J. 
High Energy Phys.}, volume = {2006}, @@ -2005,7 +2047,7 @@ @article{Goddard:1973:QuantumDynamicsMassless, title = {Quantum Dynamics of a Massless Relativistic String}, author = {Goddard, Peter and Goldstone, Jeffrey and Rebbi, Claudio and Thorn, Charles B.}, - date = {1973-05}, + date = {1973}, journaltitle = {Nuclear Physics B}, shortjournal = {Nuclear Physics B}, volume = {56}, @@ -2023,7 +2065,7 @@ title = {Flux Compactifications in String Theory: {{A}} Comprehensive Review}, shorttitle = {Flux Compactifications in String Theory}, author = {Graña, Mariana}, - date = {2006-01}, + date = {2006}, journaltitle = {Physics Reports}, shortjournal = {Physics Reports}, volume = {423}, @@ -2053,7 +2095,7 @@ @article{Green:1987:CalabiYauManifoldsComplete, title = {Calabi-{{Yau}} Manifolds as Complete Intersections in Products of Complex Projective Spaces}, author = {Green, Paul and Hübsch, Tristan}, - date = {1987-03}, + date = {1987}, journaltitle = {Communications in Mathematical Physics}, shortjournal = {Commun.Math. Phys.}, volume = {109}, @@ -2067,7 +2109,7 @@ @article{Green:1987:PolynomialDeformationsCohomology, title = {Polynomial Deformations and Cohomology of {{Calabi}}-{{Yau}} Manifolds}, author = {Green, Paul and Hübsch, Tristan}, - date = {1987-09}, + date = {1987}, journaltitle = {Communications in Mathematical Physics}, shortjournal = {Commun.Math. Phys.}, volume = {113}, @@ -2103,7 +2145,7 @@ @article{Green:1989:AllHodgeNumbers, title = {All the {{Hodge}} Numbers for All {{Calabi}}-{{Yau}} Complete Intersections}, author = {Green, Paul S. and Hübsch, Tristan and Lütken, Carsten Andrew}, - date = {1989-02-01}, + date = {1989}, journaltitle = {Classical and Quantum Gravity}, shortjournal = {Class. Quantum Grav.}, volume = {6}, @@ -2118,7 +2160,7 @@ @online{Greene:1997:StringTheoryCalabiYau, title = {String {{Theory}} on {{Calabi}}-{{Yau Manifolds}}}, author = {Greene, Brian}, - date = {1997-02}, + date = {1997}, url = {http://arxiv.org/abs/hep-th/9702155}, urldate = {2020-09-03}, abstract = {These lectures are devoted to introducing some of the basic features of quantum geometry that have been emerging from compactified string theory over the last couple of years. The developments discussed include new geometric features of string theory which occur even at the classical level as well as those which require non-perturbative effects. These lecture notes are based on an evolving set of lectures presented at a number of schools but most closely follow a series of seven lectures given at the TASI-96 summer school on Strings, Fields and Duality.}, @@ -2132,7 +2174,7 @@ @article{Grimm:2005:EffectiveActionType, title = {The Effective Action of Type {{IIA Calabi}}–{{Yau}} Orientifolds}, author = {Grimm, Thomas W. and Louis, Jan}, - date = {2005-07}, + date = {2005}, journaltitle = {Nuclear Physics B}, shortjournal = {Nuclear Physics B}, volume = {718}, @@ -2149,7 +2191,7 @@ @article{Halverson:2017:AlgorithmicUniversalityFtheory, title = {Algorithmic Universality in {{F}}-Theory Compactifications}, author = {Halverson, James and Long, Cody and Sung, Benjamin}, - date = {2017-12-11}, + date = {2017}, journaltitle = {Physical Review D}, shortjournal = {Phys. Rev. 
D}, volume = {96}, @@ -2169,7 +2211,7 @@ title = {{{TASI Lectures}} on {{Remnants}} from the {{String Landscape}}}, booktitle = {Proceedings of {{Theoretical Advanced Study Institute Summer School}} 2017 "{{Physics}} at the {{Fundamental Frontier}}" — {{PoS}}({{TASI2017}})}, author = {Halverson, James and Langacker, Paul}, - date = {2018-02-23}, + date = {2018}, pages = {019}, publisher = {{Sissa Medialab}}, location = {{Boulder, Colorado}}, @@ -2187,7 +2229,7 @@ title = {Branes with Brains: Exploring String Vacua with Deep Reinforcement Learning}, shorttitle = {Branes with Brains}, author = {Halverson, James and Nelson, Brent and Ruehle, Fabian}, - date = {2019-06}, + date = {2019}, journaltitle = {Journal of High Energy Physics}, shortjournal = {J. High Energ. Phys.}, volume = {2019}, @@ -2205,7 +2247,7 @@ @article{Halverson:2019:ComputationalComplexityVacua, title = {Computational Complexity of Vacua and Near-Vacua in Field and String Theory}, author = {Halverson, James and Ruehle, Fabian}, - date = {2019-02-22}, + date = {2019}, journaltitle = {Physical Review D}, shortjournal = {Phys. Rev. D}, volume = {99}, @@ -2224,7 +2266,7 @@ @article{Halverson:2020:StatisticalPredictionsString, title = {Statistical {{Predictions}} in {{String Theory}} and {{Deep Generative Models}}}, author = {Halverson, James and Long, Cody}, - date = {2020-05}, + date = {2020}, journaltitle = {Fortschritte der Physik}, shortjournal = {Fortschr. Phys.}, volume = {68}, @@ -2243,7 +2285,7 @@ @article{Hashimoto:2003:RecombinationIntersectingDbranes, title = {Recombination of {{Intersecting D}}-Branes by {{Local Tachyon Condensation}}}, author = {Hashimoto, Koji and Nagaoka, Satoshi}, - date = {2003-06-18}, + date = {2003}, journaltitle = {Journal of High Energy Physics}, shortjournal = {J. High Energy Phys.}, volume = {2003}, @@ -2261,7 +2303,7 @@ @article{Hashimoto:2018:DeepLearningAdS, title = {Deep Learning and the {{AdS}} / {{CFT}} Correspondence}, author = {Hashimoto, Koji and Sugishita, Sotaro and Tanaka, Akinori and Tomiya, Akio}, - date = {2018-08-27}, + date = {2018}, journaltitle = {Physical Review D}, shortjournal = {Phys. Rev. D}, volume = {98}, @@ -2281,7 +2323,7 @@ @article{Hashimoto:2018:DeepLearningHolographic, title = {Deep Learning and Holographic {{QCD}}}, author = {Hashimoto, Koji and Sugishita, Sotaro and Tanaka, Akinori and Tomiya, Akio}, - date = {2018-11-14}, + date = {2018}, journaltitle = {Physical Review D}, shortjournal = {Phys. Rev. D}, volume = {98}, @@ -2300,7 +2342,7 @@ @article{Hashimoto:2019:AdSCFTCorrespondence, title = {{{AdS}} / {{CFT}} Correspondence as a Deep {{Boltzmann}} Machine}, author = {Hashimoto, Koji}, - date = {2019-05-31}, + date = {2019}, journaltitle = {Physical Review D}, shortjournal = {Phys. Rev. 
D}, volume = {99}, @@ -2319,7 +2361,7 @@ @article{He:2017:MachinelearningStringLandscape, title = {Machine-Learning the String Landscape}, author = {He, Yang-Hui}, - date = {2017-11}, + date = {2017}, journaltitle = {Physics Letters B}, shortjournal = {Physics Letters B}, volume = {774}, @@ -2334,7 +2376,7 @@ @article{He:2019:DistinguishingEllipticFibrations, title = {Distinguishing Elliptic Fibrations with {{AI}}}, author = {He, Yang-Hui and Lee, Seung-Joo}, - date = {2019-11}, + date = {2019}, journaltitle = {Physics Letters B}, shortjournal = {Physics Letters B}, volume = {798}, @@ -2352,7 +2394,7 @@ @online{He:2020:CalabiYauSpacesString, title = {Calabi-{{Yau Spaces}} in the {{String Landscape}}}, author = {He, Yang-Hui}, - date = {2020-06}, + date = {2020}, abstract = {Calabi-Yau spaces, or Kahler spaces admitting zero Ricci curvature, have played a pivotal role in theoretical physics and pure mathematics for the last half-century. In physics, they constituted the first and natural solution to compactification of superstring theory to our 4-dimensional universe, primarily due to one of their equivalent definitions being the admittance of covariantly constant spinors. Since the mid-1980s, physicists and mathematicians have joined forces in creating explicit examples of Calabi-Yau spaces, compiling databases of formidable size, including the complete intersecion (CICY) dataset, the weighted hypersurfaces dataset, the elliptic-fibration dataset, the Kreuzer-Skarke toric hypersurface dataset, generalized CICYs etc., totaling at least on the order of 10\^10 manifolds. These all contribute to the vast string landscape, the multitude of possible vacuum solutions to string compactification. More recently, this collaboration has been enriched by computer science and data science, the former, in bench-marking the complexity of the algorithms in computing geometric quantities and the latter, in applying techniques such as machine-learning in extracting unexpected information. These endeavours, inspired by the physics of the string landscape, have rendered the investigation of Calabi-Yau spaces one of the most exciting and inter-disciplinary fields. Invited contribution to the Oxford Research Encyclopedia of Physics, B.\textasciitilde Foster Ed., OUP, 2020}, archivePrefix = {arXiv}, eprint = {2006.16623}, @@ -2364,7 +2406,7 @@ @software{Head:2020:ScikitoptimizeScikitoptimize, title = {Scikit-Optimize/Scikit-Optimize}, author = {Head, Tim and Kumar, Manoj and Nahrstaedt, Holger and Louppe, Gilles and Shcherbatyi, Iaroslav}, - date = {2020-09}, + date = {2020}, doi = {10.5281/zenodo.4014775}, organization = {{Zenodo}}, version = {v0.8.1} @@ -2375,7 +2417,7 @@ title = {Random Decision Forests}, booktitle = {Proceedings of 3rd {{International Conference}} on {{Document Analysis}} and {{Recognition}}}, author = {Ho, Tin Kam}, - date = {1995-08}, + date = {1995}, volume = {1}, pages = {278-282 vol.1}, doi = {10.1109/icdar.1995.598994}, @@ -2387,7 +2429,7 @@ @article{Honecker:2012:FieldTheoryStandard, title = {Towards the Field Theory of the {{Standard Model}} on Fractional {{D6}}-Branes on {{T6}}/{{Z6}}: {{Yukawa}} Couplings and Masses}, author = {Honecker, Gabriele and Vanhoof, Joris}, - date = {2012-01}, + date = {2012}, journaltitle = {Fortschritte der Physik}, volume = {60}, pages = {1050--1056}, @@ -2404,7 +2446,7 @@ @article{Horowitz:1991:SingularStringSolutions, title = {Singular String Solutions with Nonsingular Initial Data}, author = {Horowitz, Gary T. 
and Steif, Alan R.}, - date = {1991-04}, + date = {1991}, journaltitle = {Physics Letters B}, shortjournal = {Physics Letters B}, volume = {258}, @@ -2420,7 +2462,7 @@ @article{Horowitz:2002:InstabilitySpacelikeNull, title = {Instability of Spacelike and Null Orbifold Singularities}, author = {Horowitz, Gary T. and Polchinski, Joseph}, - date = {2002-11-25}, + date = {2002}, journaltitle = {Physical Review D}, shortjournal = {Phys. Rev. D}, volume = {66}, @@ -2457,7 +2499,7 @@ @article{Ibanez:2001:GettingJustStandard, title = {Getting Just the Standard Model at Intersecting Branes}, author = {Ibanez, Luis E. and Marchesano, Fernando and Rabadán, Raúl}, - date = {2001-11-02}, + date = {2001}, journaltitle = {Journal of High Energy Physics}, shortjournal = {J. High Energy Phys.}, volume = {2001}, @@ -2475,7 +2517,7 @@ @book{Ibanez:2012:StringTheoryParticle, title = {String Theory and Particle Physics: {{An}} Introduction to String Phenomenology}, author = {Ibanez, Luis E. and Uranga, Angel M.}, - date = {2012-02}, + date = {2012}, publisher = {{Cambridge University Press}}, file = {/home/riccardo/.local/share/zotero/files/ibanez_uranga_2012_string_theory_and_particle_physics2.pdf}, isbn = {978-0-521-51752-2 978-1-139-22742-1} @@ -2484,7 +2526,7 @@ @article{Inoue:1987:NonAbelianOrbifolds, title = {Non-{{Abelian Orbifolds}}}, author = {Inoue, Kenzo and Sakamoto, Makoto and Takano, Hiroshi}, - date = {1987-10-01}, + date = {1987}, journaltitle = {Progress of Theoretical Physics}, shortjournal = {Progress of Theoretical Physics}, volume = {78}, @@ -2501,7 +2543,7 @@ @article{Inoue:1990:StringInteractionsNonAbelian, title = {String {{Interactions}} on {{Non}}-{{Abelian Orbifold}}}, author = {Inoue, Kenzo and Nima, Shuij}, - date = {1990-10-01}, + date = {1990}, journaltitle = {Progress of Theoretical Physics}, shortjournal = {Progress of Theoretical Physics}, volume = {84}, @@ -2514,11 +2556,24 @@ number = {4} } +@online{Ioffe:2015:BatchNormalizationAccelerating, + title = {Batch {{Normalization}}: {{Accelerating Deep Network Training}} by {{Reducing Internal Covariate Shift}}}, + shorttitle = {Batch {{Normalization}}}, + author = {Ioffe, Sergey and Szegedy, Christian}, + date = {2015}, + abstract = {Training Deep Neural Networks is complicated by the fact that the distribution of each layer's inputs changes during training, as the parameters of the previous layers change. This slows down the training by requiring lower learning rates and careful parameter initialization, and makes it notoriously hard to train models with saturating nonlinearities. We refer to this phenomenon as internal covariate shift, and address the problem by normalizing layer inputs. Our method draws its strength from making normalization a part of the model architecture and performing the normalization for each training mini-batch. Batch Normalization allows us to use much higher learning rates and be less careful about initialization. It also acts as a regularizer, in some cases eliminating the need for Dropout. Applied to a state-of-the-art image classification model, Batch Normalization achieves the same accuracy with 14 times fewer training steps, and beats the original model by a significant margin. 
Using an ensemble of batch-normalized networks, we improve upon the best published result on ImageNet classification: reaching 4.9\% top-5 validation error (and 4.8\% test error), exceeding the accuracy of human raters.}, + archivePrefix = {arXiv}, + eprint = {1502.03167}, + eprinttype = {arxiv}, + file = {/home/riccardo/.local/share/zotero/files/ioffe_szegedy_2015_batch_normalization.pdf;/home/riccardo/.local/share/zotero/storage/L94NDAT8/1502.html}, + keywords = {⛔ No DOI found} +} + @article{Jackiw:1992:ElectromagneticFieldsMassless, ids = {Jackiw:1992:ElectromagneticFieldsMasslessa}, title = {Electromagnetic Fields of a Massless Particle and the Eikonal}, author = {Jackiw, R. and Kabat, D. and Ortiz, M.}, - date = {1992-02-27}, + date = {1992}, journaltitle = {Physics Letters B}, shortjournal = {Physics Letters B}, volume = {277}, @@ -2538,7 +2593,7 @@ @article{Johnson:2000:DBranePrimer, title = {D-{{Brane Primer}}}, author = {Johnson, Clifford V.}, - date = {2000-07}, + date = {2000}, journaltitle = {Strings, Branes and Gravity}, pages = {129--350}, doi = {10.1142/9789812799630_0002}, @@ -2561,7 +2616,7 @@ @online{Joyce:2002:LecturesCalabiYauSpecial, title = {Lectures on {{Calabi}}-{{Yau}} and Special {{Lagrangian}} Geometry}, author = {Joyce, Dominic}, - date = {2002-06}, + date = {2002}, abstract = {This paper gives a leisurely introduction to Calabi-Yau manifolds and special Lagrangian submanifolds from the differential geometric point of view, followed by a survey of recent results on singularities of special Lagrangian submanifolds, and their application to the SYZ Conjecture. It is aimed at graduate students in Geometry, String Theorists, and others wishing to learn the subject, and is designed to be fairly self-contained. It is based on lecture courses given at Nordfjordeid, Norway and MSRI, Berkeley in June and July 2001. We introduce Calabi-Yau m-folds via holonomy groups, Kahler geometry and the Calabi Conjecture, and special Lagrangian m-folds via calibrated geometry. `Almost Calabi-Yau m-folds' (a generalization of Calabi-Yau m-folds useful in special Lagrangian geometry) are explained and the deformation theory and moduli spaces of compact special Lagrangian submanifolds in (almost) Calabi-Yau m-folds is described. In the final part we consider isolated singularities of special Lagrangian m-folds, focussing mainly on singularities locally modelled on cones, and the expected behaviour of singularities of compact special Lagrangian m-folds in generic (almost) Calabi-Yau m-folds. String Theory, Mirror Symmetry and the SYZ Conjecture are briefly discussed, and some results of the author on singularities of special Lagrangian fibrations of Calabi-Yau 3-folds are described.}, archivePrefix = {arXiv}, eprint = {math/0108088}, @@ -2573,7 +2628,7 @@ @article{Kachru:2003:SitterVacuaString, title = {De {{Sitter Vacua}} in {{String Theory}}}, author = {Kachru, Shamit and Kallosh, Renata and Linde, Andrei and Trivedi, Sandip P.}, - date = {2003-08-07}, + date = {2003}, journaltitle = {Physical Review D}, shortjournal = {Phys. Rev. D}, volume = {68}, @@ -2591,7 +2646,7 @@ @article{Khoury:2002:BigCrunchBig, title = {From {{Big Crunch}} to {{Big Bang}}}, author = {Khoury, Justin and Ovrut, Burt A. and Seiberg, Nathan and Steinhardt, Paul J. and Turok, Neil}, - date = {2002-04-09}, + date = {2002}, journaltitle = {Physical Review D}, shortjournal = {Phys. Rev. 
D}, volume = {65}, @@ -2607,22 +2662,24 @@ } @online{Kingma:2017:AdamMethodStochastica, + ids = {Kingma:2017:AdamMethodStochastic}, title = {Adam: {{A Method}} for {{Stochastic Optimization}}}, shorttitle = {Adam}, author = {Kingma, Diederik P. and Ba, Jimmy}, - date = {2017-01-29}, + date = {2017}, abstract = {We introduce Adam, an algorithm for first-order gradient-based optimization of stochastic objective functions, based on adaptive estimates of lower-order moments. The method is straightforward to implement, is computationally efficient, has little memory requirements, is invariant to diagonal rescaling of the gradients, and is well suited for problems that are large in terms of data and/or parameters. The method is also appropriate for non-stationary objectives and problems with very noisy and/or sparse gradients. The hyper-parameters have intuitive interpretations and typically require little tuning. Some connections to related algorithms, on which Adam was inspired, are discussed. We also analyze the theoretical convergence properties of the algorithm and provide a regret bound on the convergence rate that is comparable to the best known results under the online convex optimization framework. Empirical results demonstrate that Adam works well in practice and compares favorably to other stochastic optimization methods. Finally, we discuss AdaMax, a variant of Adam based on the infinity norm.}, archivePrefix = {arXiv}, eprint = {1412.6980}, eprinttype = {arxiv}, - file = {/home/riccardo/.local/share/zotero/files/kingma_ba_2017_adam2.pdf;/home/riccardo/.local/share/zotero/storage/EYEANITG/1412.html}, - keywords = {⛔ No DOI found} + file = {/home/riccardo/.local/share/zotero/files/kingma_ba_2017_adam.pdf;/home/riccardo/.local/share/zotero/files/kingma_ba_2017_adam2.pdf;/home/riccardo/.local/share/zotero/storage/EYEANITG/1412.html}, + keywords = {⛔ No DOI found}, + version = {8} } @article{Kiritsis:1994:StringPropagationGravitational, title = {String {{Propagation}} in {{Gravitational Wave Backgrounds}}}, author = {Kiritsis, Elias and Kounnas, Costas}, - date = {1994-01}, + date = {1994}, journaltitle = {Physics Letters B}, shortjournal = {Physics Letters B}, volume = {320}, @@ -2640,7 +2697,7 @@ @article{Klaewer:2019:MachineLearningLine, title = {Machine Learning Line Bundle Cohomologies of Hypersurfaces in Toric Varieties}, author = {Klaewer, Daniel and Schlechter, Lorenz}, - date = {2019-02}, + date = {2019}, journaltitle = {Physics Letters B}, shortjournal = {Physics Letters B}, volume = {789}, @@ -2655,11 +2712,10 @@ langid = {english} } -@article{Krefl:2017:MachineLearningCalabiYaua, - ids = {Krefl:2017:MachineLearningCalabiYau}, +@article{Krefl:2017:MachineLearningCalabiYau, title = {Machine Learning of {{Calabi}}-{{Yau}} Volumes}, author = {Krefl, Daniel and Seong, Rak-Kyeong}, - date = {2017-09-12}, + date = {2017}, journaltitle = {Physical Review D}, shortjournal = {Phys. Rev. D}, volume = {96}, @@ -2694,7 +2750,7 @@ @online{Krippendorf:2010:CambridgeLecturesSupersymmetry, title = {Cambridge {{Lectures}} on {{Supersymmetry}} and {{Extra Dimensions}}}, author = {Krippendorf, Sven and Quevedo, Fernando and Schlotterer, Oliver}, - date = {2010-11-05}, + date = {2010}, abstract = {These lectures on supersymmetry and extra dimensions are aimed at finishing undergraduate and beginning postgraduate students with a background in quantum field theory and group theory. Basic knowledge in general relativity might be advantageous for the discussion of extra dimensions. 
This course was taught as a 24+1 lecture course in Part III of the Mathematical Tripos in recent years. The first six chapters give an introduction to supersymmetry in four spacetime dimensions, they fill about two thirds of the lecture notes and are in principle self-contained. The remaining two chapters are devoted to extra spacetime dimensions which are in the end combined with the concept of supersymmetry. Videos from the course lectured in 2006 can be found online at http://www.sms.cam.ac.uk/collection/659537 .}, archivePrefix = {arXiv}, eprint = {1011.1491}, @@ -2707,7 +2763,7 @@ @online{Krippendorf:2020:DetectingSymmetriesNeural, title = {Detecting {{Symmetries}} with {{Neural Networks}}}, author = {Krippendorf, Sven and Syvaeri, Marc}, - date = {2020-03-30}, + date = {2020}, abstract = {Identifying symmetries in data sets is generally difficult, but knowledge about them is crucial for efficient data handling. Here we present a method how neural networks can be used to identify symmetries. We make extensive use of the structure in the embedding layer of the neural network which allows us to identify whether a symmetry is present and to identify orbits of the symmetry in the input. To determine which continuous or discrete symmetry group is present we analyse the invariant orbits in the input. We present examples based on rotation groups \$SO(n)\$ and the unitary group \$SU(2).\$ Further we find that this method is useful for the classification of complete intersection Calabi-Yau manifolds where it is crucial to identify discrete symmetries on the input space. For this example we present a novel data representation in terms of graphs.}, archivePrefix = {arXiv}, eprint = {2003.13679}, @@ -2719,7 +2775,7 @@ @article{Krishnan:2020:MachineLearningGauged, title = {Machine {{Learning Gauged Supergravity}}}, author = {Krishnan, Chethan and Mohan, Vyshnav and Ray, Soham}, - date = {2020-05}, + date = {2020}, journaltitle = {Fortschritte der Physik}, shortjournal = {Fortschr. Phys.}, volume = {68}, @@ -2738,7 +2794,7 @@ @article{Lerche:1987:ChiralFourdimensionalHeterotic, title = {Chiral Four-Dimensional Heterotic Strings from Self-Dual Lattices}, author = {Lerche, Wolfgang and Lüst, Dieter and Schellekens, A. N.}, - date = {1987-01}, + date = {1987}, journaltitle = {Nuclear Physics B}, shortjournal = {Nuclear Physics B}, volume = {287}, @@ -2769,7 +2825,7 @@ ids = {Liu:2002:StringsTimeDependent}, title = {Strings in a {{Time}}-{{Dependent Orbifold}}}, author = {Liu, Hong and Moore, Gregory and Seiberg, Nathan}, - date = {2002-06}, + date = {2002}, journaltitle = {Journal of High Energy Physics}, shortjournal = {J. High Energy Phys.}, volume = {2002}, @@ -2791,7 +2847,7 @@ ids = {Liu:2002:StringsTimeDependenta}, title = {Strings in {{Time}}-{{Dependent Orbifolds}}}, author = {Liu, Hong and Moore, Gregory and Seiberg, Nathan}, - date = {2002-10}, + date = {2002}, journaltitle = {Journal of High Energy Physics}, shortjournal = {J. 
High Energy Phys.}, volume = {2002}, @@ -2813,7 +2869,7 @@ @article{Lust:2009:LHCStringHunter, title = {The {{LHC String Hunter}}'s {{Companion}}}, author = {Lüst, Dieter and Stieberger, Stephan and Taylor, Tomasz R.}, - date = {2009-02}, + date = {2009}, journaltitle = {Nuclear Physics B}, shortjournal = {Nuclear Physics B}, volume = {808}, @@ -2831,7 +2887,7 @@ @article{Lust:2009:SeeingStringLandscape, title = {Seeing through the String Landscape—a String Hunter's Companion in Particle Physics and Cosmology}, author = {Lüst, Dieter}, - date = {2009-03-30}, + date = {2009}, journaltitle = {Journal of High Energy Physics}, shortjournal = {J. High Energy Phys.}, volume = {2009}, @@ -2866,6 +2922,35 @@ number = {2065} } +@article{Maurer:2016:BenefitMultitaskRepresentation, + title = {The Benefit of Multitask Representation Learning}, + author = {Maurer, Andreas and Pontil, Massimiliano and Romera-Paredes, Bernardino}, + date = {2016}, + journaltitle = {The Journal of Machine Learning Research}, + volume = {17}, + pages = {2853--2884}, + publisher = {{JMLR. org}}, + keywords = {⛔ No DOI found}, + number = {1} +} + +@article{Mehta:2019:HighbiasLowvarianceIntroduction, + title = {A High-Bias, Low-Variance Introduction to {{Machine Learning}} for Physicists}, + author = {Mehta, Pankaj and Bukov, Marin and Wang, Ching-Hao and Day, Alexandre G. R. and Richardson, Clint and Fisher, Charles K. and Schwab, David J.}, + date = {2019}, + journaltitle = {Physics Reports}, + shortjournal = {Physics Reports}, + volume = {810}, + pages = {1--124}, + issn = {03701573}, + doi = {10.1016/j.physrep.2019.03.001}, + abstract = {Machine Learning (ML) is one of the most exciting and dynamic areas of modern research and application. The purpose of this review is to provide an introduction to the core concepts and tools of machine learning in a manner easily understood and intuitive to physicists. The review begins by covering fundamental concepts in ML and modern statistics such as the bias-variance tradeoff, overfitting, regularization, generalization, and gradient descent before moving on to more advanced topics in both supervised and unsupervised learning. Topics covered in the review include ensemble models, deep learning and neural networks, clustering and data visualization, energy-based models (including MaxEnt models and Restricted Boltzmann Machines), and variational methods. Throughout, we emphasize the many natural connections between ML and statistical physics. A notable aspect of the review is the use of Python Jupyter notebooks to introduce modern ML/statistical packages to readers using physics-inspired datasets (the Ising Model and Monte-Carlo simulations of supersymmetric decays of proton-proton collisions). We conclude with an extended outlook discussing possible uses of machine learning for furthering our understanding of the physical world as well as open problems in ML where physicists may be able to contribute. 
(Notebooks are available at https://physics.bu.edu/\textasciitilde pankajm/MLnotebooks.html )}, + archivePrefix = {arXiv}, + eprint = {1803.08823}, + eprinttype = {arxiv}, + file = {/home/riccardo/.local/share/zotero/files/mehta_et_al_2019_a_high-bias,_low-variance_introduction_to_machine_learning_for_physicists3.pdf;/home/riccardo/.local/share/zotero/storage/DVY32RS5/1803.html} +} + @inproceedings{Mockus:1975:BayesianMethodsSeeking, ids = {Mockus:1975:BayesianMethodsSeekinga}, title = {On Bayesian Methods for Seeking the Extremum}, @@ -2886,7 +2971,7 @@ @article{Mutter:2019:DeepLearningHeterotic, title = {Deep Learning in the Heterotic Orbifold Landscape}, author = {Mütter, Andreas and Parr, Erik and Vaudrevange, Patrick K. S.}, - date = {2019-03}, + date = {2019}, journaltitle = {Nuclear Physics B}, shortjournal = {Nuclear Physics B}, volume = {940}, @@ -2901,10 +2986,19 @@ langid = {english} } +@inproceedings{Ndirango:2019:GeneralizationMultitaskDeep, + title = {Generalization in Multitask Deep Neural Classifiers: A Statistical Physics Approach}, + booktitle = {Advances in Neural Information Processing Systems}, + author = {Ndirango, Anthony and Lee, Tyler}, + date = {2019}, + pages = {15862--15871}, + keywords = {⛔ No DOI found} +} + @article{Nilsson:1990:GeneralNSRString, title = {General {{NSR}} String Reggeon Vertices from a Dual Ramond Vertex}, author = {Nilsson, Bengt E. W. and Tollstén, Anna K.}, - date = {1990-04}, + date = {1990}, journaltitle = {Physics Letters B}, shortjournal = {Physics Letters B}, volume = {240}, @@ -2919,7 +3013,7 @@ @article{Otsuka:2020:DeepLearningKmeans, title = {Deep Learning and K-Means Clustering in Heterotic String Vacua with Line Bundles}, author = {Otsuka, Hajime and Takemoto, Kenta}, - date = {2020-05}, + date = {2020}, journaltitle = {Journal of High Energy Physics}, shortjournal = {J. High Energ. Phys.}, volume = {2020}, @@ -2937,7 +3031,7 @@ @article{Parr:2020:ContrastDataMining, title = {Contrast Data Mining for the {{MSSM}} from Strings}, author = {Parr, Erik and Vaudrevange, Patrick K.S.}, - date = {2020-03}, + date = {2020}, journaltitle = {Nuclear Physics B}, shortjournal = {Nuclear Physics B}, volume = {952}, @@ -2955,7 +3049,7 @@ @article{Parr:2020:PredictingOrbifoldOrigin, title = {Predicting the {{Orbifold Origin}} of the {{MSSM}}}, author = {Parr, Erik and Vaudrevange, Patrick K. S. and Wimmer, Martin}, - date = {2020-05}, + date = {2020}, journaltitle = {Fortschritte der Physik}, shortjournal = {Fortschr. Phys.}, volume = {68}, @@ -2974,7 +3068,7 @@ @article{Paton:1969:GeneralizedVenezianoModel, title = {Generalized {{Veneziano}} Model with Isospin}, author = {Paton, Jack E. and {Chan Hong-Mo}}, - date = {1969-05}, + date = {1969}, journaltitle = {Nuclear Physics B}, shortjournal = {Nuclear Physics B}, volume = {10}, @@ -3001,7 +3095,7 @@ } @inproceedings{Peng:2017:LargeKernelMattersa, - title = {Large Kernel {{Matters}}–{{Improve}} Semantic Segmentation by Global Convolutional Network}, + title = {Large Kernel Matters. 
{{Improve}} Semantic Segmentation by Global Convolutional Network}, booktitle = {Proceedings of the {{IEEE}} Conference on Computer Vision and Pattern Recognition}, author = {Peng, Chao and Zhang, Xiangyu and Yu, Gang and Luo, Guiming and Sun, Jian}, date = {2017}, @@ -3013,7 +3107,7 @@ @article{Pesando:2008:MultibranesBoundaryStates, title = {Multi-Branes Boundary States with Open String Interactions}, author = {Pesando, Igor}, - date = {2008-04}, + date = {2008}, journaltitle = {Nuclear Physics B}, volume = {793}, pages = {211--245}, @@ -3028,7 +3122,7 @@ @article{Pesando:2010:OpenClosedString, title = {Open and {{Closed String Vertices}} for Branes with Magnetic Field and {{T}}-Duality}, author = {Pesando, Igor}, - date = {2010-02}, + date = {2010}, journaltitle = {Journal of High Energy Physics}, shortjournal = {J. High Energ. Phys.}, volume = {2010}, @@ -3046,7 +3140,7 @@ @online{Pesando:2011:GeneratingFunctionAmplitudes, title = {The Generating Function of Amplitudes with {{N}} Twisted and {{M}} Untwisted States}, author = {Pesando, Igor}, - date = {2011-07-27}, + date = {2011}, abstract = {We show that the generating function of all amplitudes with N twisted and M untwisted states, i.e. the Reggeon vertex for magnetized branes on R\^2 can be computed once the correlator of N non excited twisted states and the corresponding Green function are known and we give an explicit expression as a functional of the these objects}, archivePrefix = {arXiv}, eprint = {1107.5525}, @@ -3059,7 +3153,7 @@ @article{Pesando:2011:StringsArbitraryConstant, title = {Strings in an Arbitrary Constant Magnetic Field with Arbitrary Constant Metric and Stringy Form Factors}, author = {Pesando, Igor}, - date = {2011-06}, + date = {2011}, journaltitle = {Journal of High Energy Physics}, shortjournal = {J. High Energ. 
Phys.}, volume = {2011}, @@ -3078,7 +3172,7 @@ ids = {Pesando:2013:GreenFunctionsTwist}, title = {Green Functions and Twist Correlators for {{N}} Branes at Angles}, author = {Pesando, Igor}, - date = {2012-06}, + date = {2012}, journaltitle = {Nuclear Physics B}, volume = {866}, pages = {87--123}, @@ -3098,7 +3192,7 @@ @article{Pesando:2013:LightConeQuantization, title = {Light Cone Quantization and Interactions of a New Closed Bosonic String Inspired to {{D1}} String}, author = {Pesando, Igor}, - date = {2013-11}, + date = {2013}, journaltitle = {Nuclear Physics B}, shortjournal = {Nuclear Physics B}, volume = {876}, @@ -3116,7 +3210,7 @@ @article{Pesando:2014:CanonicalQuantizationString, title = {Canonical Quantization of a String Describing {{N}} Branes at Angles}, author = {Pesando, Igor}, - date = {2014-12}, + date = {2014}, journaltitle = {Nuclear Physics B}, shortjournal = {Nuclear Physics B}, volume = {889}, @@ -3134,7 +3228,7 @@ ids = {Pesando:2014:CorrelatorsArbitraryUntwisteda}, title = {Correlators of Arbitrary Untwisted Operators and Excited Twist Operators for {{N}} Branes at Angles}, author = {Pesando, Igor}, - date = {2014-01}, + date = {2014}, journaltitle = {Nuclear Physics B}, volume = {886}, pages = {243--287}, @@ -3153,7 +3247,7 @@ ids = {Pesando:2016:FullyStringyComputationa}, title = {Towards a Fully Stringy Computation of {{Yukawa}} Couplings on Non-Factorized Tori and Non-Abelian Twist Correlators ({{I}}): {{The}} Classical Solution and Action}, author = {Pesando, Igor}, - date = {2016-09}, + date = {2016}, journaltitle = {Nuclear Physics B}, volume = {910}, pages = {618--664}, @@ -3171,7 +3265,7 @@ @article{Petersen:1989:CovariantSuperreggeonCalculus, title = {Covariant Super-Reggeon Calculus for Superstrings}, author = {Petersen, Jens L. and Sidenius, J. R. and Tollsten, A. K.}, - date = {1989-04}, + date = {1989}, journaltitle = {Nuclear Physics B}, shortjournal = {Nuclear Physics B}, volume = {317}, @@ -3186,7 +3280,7 @@ @article{Polchinski:1995:DirichletBranesRamondRamond, title = {Dirichlet Branes and {{Ramond}}-{{Ramond}} Charges}, author = {Polchinski, Joseph}, - date = {1995-12}, + date = {1995}, journaltitle = {Physical Review Letters}, volume = {75}, pages = {4724--4727}, @@ -3203,7 +3297,7 @@ @article{Polchinski:1996:TASILecturesDBranes, title = {{{TASI Lectures}} on {{D}}-{{Branes}}}, author = {Polchinski, Joseph}, - date = {1996-11}, + date = {1996}, journaltitle = {New Frontiers in Fields and Strings}, pages = {75--136}, abstract = {This is an introduction to the properties of D-branes, topological defects in string theory on which string endpoints can live. D-branes provide a simple description of various nonperturbative objects required by string duality, and give new insight into the quantum mechanics of black holes and the nature of spacetime at the shortest distances. The first two thirds of these lectures closely follow the earlier ITP lectures hep-th/9602052, written with S. Chaudhuri and C. Johnson. 
@@ -3241,7 +3335,7 @@
 @article{Polyakov:1981:QuantumGeometryBosonic,
   title = {Quantum Geometry of Bosonic Strings},
   author = {Polyakov, Alexander M.},
-  date = {1981-07},
+  date = {1981},
   journaltitle = {Physics Letters B},
   shortjournal = {Physics Letters B},
   volume = {103},
@@ -3258,7 +3352,7 @@
   ids = {Quinlan:1986:InductionDecisionTreesa},
   title = {Induction of Decision Trees},
   author = {Quinlan, John R.},
-  date = {1986-03-01},
+  date = {1986},
   journaltitle = {Machine Learning},
   shortjournal = {Mach Learn},
   volume = {1},
@@ -3288,7 +3382,7 @@
 @article{Ruehle:2017:EvolvingNeuralNetworks,
   title = {Evolving Neural Networks with Genetic Algorithms to Study the String Landscape},
   author = {Ruehle, Fabian},
-  date = {2017-08},
+  date = {2017},
   journaltitle = {Journal of High Energy Physics},
   shortjournal = {J. High Energ. Phys.},
   volume = {2017},
@@ -3306,7 +3400,7 @@
 @article{Ruehle:2020:DataScienceApplications,
   title = {Data Science Applications to String Theory},
   author = {Ruehle, Fabian},
-  date = {2020-01},
+  date = {2020},
   journaltitle = {Physics Reports},
   shortjournal = {Physics Reports},
   volume = {839},
@@ -3333,7 +3427,7 @@
 @online{Schellekens:2017:BigNumbersString,
   title = {Big {{Numbers}} in {{String Theory}}},
   author = {Schellekens, A. N.},
-  date = {2017-06-27},
+  date = {2017},
   abstract = {This paper contains some personal reflections on several computational contributions to what is now known as the "String Theory Landscape". It consists of two parts. The first part concerns the origin of big numbers, and especially the number \$10\^\{1500\}\$ that appeared in work on the covariant lattice construction (with W. Lerche and D. Luest). This part contains some new results. I correct a huge but inconsequential error, discuss some more accurate estimates, and compare with the counting for free fermion constructions. In particular I prove that the latter only provide an exponentially small fraction of all even self-dual lattices for large lattice dimensions. The second part of the paper concerns dealing with big numbers, and contains some lessons learned from various vacuum scanning projects.},
   archivePrefix = {arXiv},
   eprint = {1601.02462},
@@ -3346,7 +3440,7 @@
 @article{Schwarz:1973:EvaluationDualFermion,
   title = {Evaluation of Dual Fermion Amplitudes},
   author = {Schwarz, John H. and Wu, C. C.},
-  date = {1973-12},
+  date = {1973},
   journaltitle = {Physics Letters B},
   shortjournal = {Physics Letters B},
   volume = {47},
@@ -3361,7 +3455,7 @@
 @article{Sciuto:1969:GeneralVertexFunction,
   title = {The General Vertex Function in Dual Resonance Models},
   author = {Sciuto, Stefano},
-  date = {1969-09},
+  date = {1969},
   journaltitle = {Lettere al Nuovo Cimento},
   shortjournal = {Lett. Nuovo Cimento},
   volume = {2},
@@ -3373,11 +3467,24 @@
   number = {9}
 }

+@article{Shahriari:2015:TakingHumanOut,
+  title = {Taking the Human out of the Loop: {{A}} Review of {{Bayesian}} Optimization},
+  author = {Shahriari, Bobak and Swersky, Kevin and Wang, Ziyu and Adams, Ryan P and De Freitas, Nando},
+  date = {2015},
+  journaltitle = {Proceedings of the IEEE},
+  volume = {104},
+  pages = {148--175},
+  publisher = {{IEEE}},
+  doi = {10.1109/JPROC.2015.2494218},
+  file = {/home/riccardo/.local/share/zotero/files/shahriari_et_al_2015_taking_the_human_out_of_the_loop.pdf},
+  number = {1}
+}
+
 @article{Sheikh-Jabbari:1998:ClassificationDifferentBranes,
   ids = {SheikhJabbari:1998:ClassificationDifferentBranes},
   title = {Classification of {{Different Branes}} at {{Angles}}},
   author = {Sheikh-Jabbari, Mohammad M.},
-  date = {1998-02},
+  date = {1998},
   journaltitle = {Physics Letters B},
   shortjournal = {Physics Letters B},
   volume = {420},
@@ -3400,10 +3507,19 @@
   file = {/home/riccardo/.local/share/zotero/files/skiena_2017_the_data_science_design_manual.pdf}
 }

+@inproceedings{Snoek:2012:PracticalBayesianOptimization,
+  title = {Practical Bayesian Optimization of Machine Learning Algorithms},
+  booktitle = {Advances in Neural Information Processing Systems},
+  author = {Snoek, Jasper and Larochelle, Hugo and Adams, Ryan P.},
+  date = {2012},
+  pages = {2951--2959},
+  keywords = {⛔ No DOI found}
+}
+
 @article{Soldate:1987:PartialwaveUnitarityClosedstring,
   title = {Partial-Wave Unitarity and Closed-String Amplitudes},
   author = {Soldate, Mark},
-  date = {1987-03},
+  date = {1987},
   journaltitle = {Physics Letters B},
   shortjournal = {Physics Letters B},
   volume = {186},
@@ -3434,7 +3550,7 @@
   title = {Yukawa {{Couplings}} for {{Bosonic Z}}{{{\textsubscript{N}}}} {{Orbifolds}}: {{Their Moduli}} and {{Twisted Sector Dependence}}},
   shorttitle = {Yukawa {{Couplings}} for {{Bosonic}} \${{Z}}\_{{N}}\$ {{Orbifolds}}},
   author = {Stieberger, Stephan and Jungnickel, Dirk-U. and Lauer, Juergen and Spaliński, Michał},
-  date = {1992-10-30},
+  date = {1992},
   journaltitle = {Modern Physics Letters A},
   shortjournal = {Mod. Phys. Lett. A},
   volume = {07},
@@ -3452,7 +3568,7 @@
 @online{Susskind:2003:AnthropicLandscapeString,
   title = {The {{Anthropic Landscape}} of {{String Theory}}},
   author = {Susskind, Leonard},
-  date = {2003-02},
+  date = {2003},
   abstract = {In this lecture I make some educated guesses about the landscape of string theory vacua. Based on the recent work of a number of authors, it seems plausible that the landscape is unimaginably large and diverse. Whether we like it or not, this is the kind of behavior that gives credence to the Anthropic Principle. I discuss the theoretical and conceptual issues that arise in developing a cosmology based on the diversity of environments implicit in string theory.},
   archivePrefix = {arXiv},
   eprint = {hep-th/0302219},
@@ -3461,20 +3577,39 @@
   keywords = {⛔ No DOI found}
 }

-@inproceedings{Szegedy:2014:GoingDeeperConvolutions,
-  ids = {Szegedy:2015:GoingDeeperConvolutions},
-  title = {Going {{Deeper}} with {{Convolutions}}},
+@inproceedings{Szegedy:2015:GoingDeeperConvolutions,
+  title = {Going Deeper with Convolutions},
+  booktitle = {Proceedings of the {{IEEE}} Conference on Computer Vision and Pattern Recognition},
   author = {Szegedy, Christian and Liu, Wei and Jia, Yangqing and Sermanet, Pierre and Reed, Scott and Anguelov, Dragomir and Erhan, Dumitru and Vanhoucke, Vincent and Rabinovich, Andrew},
-  date = {2014-09-16},
-  url = {http://arxiv.org/abs/1409.4842},
-  urldate = {2020-05-17},
+  date = {2015},
+  pages = {1--9},
   abstract = {We propose a deep convolutional neural network architecture codenamed "Inception", which was responsible for setting the new state of the art for classification and detection in the ImageNet Large-Scale Visual Recognition Challenge 2014 (ILSVRC 2014). The main hallmark of this architecture is the improved utilization of the computing resources inside the network. This was achieved by a carefully crafted design that allows for increasing the depth and width of the network while keeping the computational budget constant. To optimize quality, the architectural decisions were based on the Hebbian principle and the intuition of multi-scale processing. One particular incarnation used in our submission for ILSVRC 2014 is called GoogLeNet, a 22 layers deep network, the quality of which is assessed in the context of classification and detection.},
   archivePrefix = {arXiv},
   eprint = {1409.4842},
   eprinttype = {arxiv},
-  file = {/home/riccardo/.local/share/zotero/files/szegedy_et_al_2014_going_deeper_with_convolutions.pdf;/home/riccardo/.local/share/zotero/storage/5QNPUTZM/1409.html},
-  keywords = {⛔ No DOI found},
-  primaryClass = {cs}
+  keywords = {⛔ No DOI found}
+}
+
+@online{Szegedy:2016:Inceptionv4InceptionresnetImpact,
+  title = {Inception-v4, Inception-Resnet and the Impact of Residual Connections on Learning},
+  author = {Szegedy, Christian and Ioffe, Sergey and Vanhoucke, Vincent and Alemi, Alex},
+  date = {2016},
+  archivePrefix = {arXiv},
+  eprint = {1602.07261},
+  eprinttype = {arxiv},
+  keywords = {⛔ No DOI found}
+}
+
+@inproceedings{Szegedy:2016:RethinkingInceptionArchitecture,
+  title = {Rethinking the Inception Architecture for Computer Vision},
+  booktitle = {Proceedings of the {{IEEE}} Conference on Computer Vision and Pattern Recognition},
+  author = {Szegedy, Christian and Vanhoucke, Vincent and Ioffe, Sergey and Shlens, Jon and Wojna, Zbigniew},
+  date = {2016},
+  pages = {2818--2826},
+  archivePrefix = {arXiv},
+  eprint = {1512.00567},
+  eprinttype = {arxiv},
+  keywords = {⛔ No DOI found}
 }

 @article{Tan:2019:DeepLearningHolographic,
@@ -3601,6 +3736,15 @@
   options = {useprefix=true}
 }

+@inproceedings{Thrun:1996:LearningNthThing,
+  title = {Is Learning the N-Th Thing Any Easier than Learning the First?},
+  booktitle = {Advances in Neural Information Processing Systems},
+  author = {Thrun, Sebastian},
+  date = {1996},
+  pages = {640--646},
+  keywords = {⛔ No DOI found}
+}
+
 @inproceedings{Tompson:2015:EfficientObjectLocalization,
   title = {Efficient {{Object Localization Using Convolutional Networks}}},
   author = {Tompson, Jonathan and Goroshin, Ross and Jain, Arjun and LeCun, Yann and Bregler, Christopher},
diff --git a/thesis.tex b/thesis.tex
index 1073818..bdc056c 100644
--- a/thesis.tex
+++ b/thesis.tex
@@ -27,6 +27,8 @@
 \newcommand{\ml}{\textsc{ml}\xspace}
 \newcommand{\nn}{\textsc{nn}\xspace}
 \newcommand{\eda}{\textsc{eda}\xspace}
+\newcommand{\pca}{\textsc{pca}\xspace}
+\newcommand{\svm}{\textsc{svm}\xspace}
 \newcommand{\bo}{\textsc{bo}\xspace}
 \newcommand{\nbo}{\textsc{nbo}\xspace}
 \newcommand{\gnbo}{\textsc{gnbo}\xspace}
@@ -142,6 +144,8 @@
 \label{part:deeplearning}
 \section{Introduction}
 \input{sec/part3/introduction.tex}
+\section{Machine and Deep Learning for CICY Manifolds}
+\input{sec/part3/ml.tex}

 %---- APPENDIX
 \thesispart{Appendix}
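The \pca and \svm commands added to thesis.tex above follow the same pattern as the existing acronym macros: \textsc for small capitals plus \xspace for the trailing space. A minimal compilable sketch of how they behave in running text follows; the document class and the sample sentence are illustrative assumptions, and only the two \newcommand lines come from the patch itself.

\documentclass{article}
\usepackage{xspace} % provides \xspace, which decides whether to emit a trailing space

% acronym macros as introduced in thesis.tex
\newcommand{\pca}{\textsc{pca}\xspace}
\newcommand{\svm}{\textsc{svm}\xspace}

\begin{document}
% \xspace inserts a space before a following word but not before punctuation,
% so both "\pca is" and "the \svm," typeset correctly without a manual "\ ".
Dimensional reduction with \pca is performed before training the \svm, so
the \svm regressor only sees the leading principal components.
\end{document}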