%%%%%%%%%%%%%%%%% DO NOT CHANGE HERE %%%%%%%%%%%%%%%%%%%% {
\documentclass[12pt,letterpaper]{article}
\usepackage{fullpage}
\usepackage[top=2cm, bottom=4.5cm, left=2.5cm, right=2.5cm]{geometry}
\usepackage{amsmath,amsthm,amsfonts,amssymb,amscd}
\usepackage{lastpage}
\usepackage{enumerate}
\usepackage{fancyhdr}
\usepackage{mathrsfs}
\usepackage{xcolor}
\usepackage{graphicx}
\usepackage{listings}
\usepackage{hyperref}
\usepackage{todonotes}[disable]
\usepackage{esvect}
\usepackage{bbm}
\usepackage{cases}
\hypersetup{%
  colorlinks=true,
  linkcolor=blue,
  linkbordercolor={0 0 1}
}

\setlength{\parindent}{0.0in}
\setlength{\parskip}{0.05in}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% }

%%%%%%%%%%%%%%%%%%%%%%%% CHANGE HERE %%%%%%%%%%%%%%%%%%%% {
\newcommand\course{CMPT 727}
\newcommand\semester{Spring 2022}
\newcommand\hwnumber{6}                 % <-- ASSIGNMENT #
%\newcommand\NetIDa{Your Name}           % <-- YOUR NAME
%\newcommand\NetIDb{200XXYYZZ}           % <-- STUDENT ID #
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% }

%%%%%%%%%%%%%%%%% DO NOT CHANGE HERE %%%%%%%%%%%%%%%%%%%% {
\pagestyle{fancyplain}
\headheight 35pt
\chead{\textbf{\Large Assignment \hwnumber}}
\rhead{\course \\ \semester}
\lfoot{}
\cfoot{}
\rfoot{\small\thepage}
\headsep 1.5em
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% }

\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\begin{document}

\section*{Problem 1}

The Categorical distribution is defined as \[\text{Cat}(y|\Vec{\theta} ) = \prod_{c=1}^C \theta_c^{\mathbbm{1} (y=c)},\] for $y \in \{1,2,\dots , C\}$ where $C$ is the number of labels and $C>1$. We have observed $N$ observations of $Y$, with $N_k$ observation of each label. Suppose we decide to place a (unusual) zero-mean, identity-covariance Gaussian prior on $\Vec{\theta}$, $p(\Vec{\theta}) \propto \exp(-\theta^\top \theta)$.

You would like to find the MAP of $\Vec{\theta}$.

\begin{enumerate}
    \item Form the Lagrangian expression $\mathcal{L}(\Vec{\theta}, \lambda)$.
    \item Using your answer in Part 1., calculate the partial derivative with respect to each $\theta_k$ and $\lambda$.
    \item Briefly describe how to use your answer to Part 2 to find the MAP for each $\theta_k$. (You do not need to find an explicit solution.)
\end{enumerate}



\section*{Problem 2}
Consider a quadratic objective function on $\mathbf{R}^2$ \[ f(x) = \frac{1}{2}(x_1^2 + \gamma x_2^2), \]

where $\gamma >0$.
We would like to apply the gradient descent method with exact line search, starting at the point $\Vec{x}^{(0)} = (\gamma,1)$.

\begin{enumerate}
\item Derive the exact line search update.



\item Suppose $\gamma = 10$, what is the value of $\Vec{x}^{(3)}$?



\end{enumerate}


\section*{Problem 3}
In many classification problems one has the option either of assigning $x$ to class $j$
or, if you are too uncertain, of choosing a ``reject'' option. If the cost for rejects is less than the cost of
falsely classifying the object, it may be the optimal action. Let $\alpha_i$ mean you choose action $i$, for $i = 1 : C +1$,
where $C$ is the number of classes and $C + 1$ is the reject action. Let $Y = j$ be the true (but unknown) state
of nature. Define the loss function as follows
\begin{numcases}{\lambda(\alpha_i | Y = j)=}
\nonumber 0 & \text{ if $i=j$ and $i,j \in \{ 1, \dots, C \}$} \\
\nonumber \lambda_r & \text{ if $i = C+1$}\\
\nonumber \lambda_s, & \text{ otherwise}
\end{numcases}

In other words, you incur 0 loss if you correctly classify, you incur $\lambda_r$ loss (cost) if you choose the reject option,
and you incur $\lambda_s$ loss (cost) if you make a substitution error (misclassification). Both  $\lambda_r$ and $\lambda_s$ are positive numbers.

\begin{enumerate}
    \item Show that when we decide to choose a class (and not reject), we always pick the most probable one.


    \item Show that the minimum risk is obtained if we decide to pick the most probable class $j_{max} = \argmax_j p(Y=j|x)$ and if $p(Y = j_{max}|x) \geq 1 - \frac{\lambda_r}{\lambda_s} $ ; otherwise we decide to reject.





    \item Describe qualitatively what happens as $\lambda_r/\lambda_s$ is increased from $0$ to $1$ (i.e., the relative cost of rejection
increases).



\end{enumerate}



\section*{Problem 4}
Please write one thing from this course you found confusing, a topic you would like to hear more about, or something you found particularly interesting.




\end{document}