Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Introduce Ftrace in the syscall section #180

Draft
wants to merge 11 commits into
base: master
Choose a base branch
from
Binary file added assets/syscall/flow.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added assets/syscall/kernel-livepatching1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added assets/syscall/kernel-livepatching2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions examples/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ obj-m += kbleds.o
obj-m += sched.o
obj-m += chardev2.o
obj-m += syscall.o
obj-m += syscall-ftrace.o
obj-m += intrpt.o
obj-m += cryptosha256.o
obj-m += cryptosk.o
Expand Down
227 changes: 227 additions & 0 deletions examples/syscall-ftrace.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
/**
* syscall-ftrace.c
*
* System call "stealing" with ftrace
*
* We register an ftrace callback on the system call handler.
* The callback rewrites the saved instruction pointer so that execution
* continues in our spying function, which then calls the original handler.
*
* The callback function is triggered by ftrace.
*/

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/version.h>
#include <linux/unistd.h>
#include <linux/kprobes.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/ftrace.h>

MODULE_LICENSE("GPL");

/** Size in bytes of the kernel buffer that receives the opened file name. */
#define MAX_FILENAME_SIZE 200

/** Prefix every pr_*() message emitted by this file with the module name. */
#undef pr_fmt
#define pr_fmt(fmt) "[syscall-ftrace] " fmt

/**
 * UID we want to spy on - will be filled from the command line
 * (e.g. `insmod syscall-ftrace.ko uid=1000`). Defaults to 0.
 */
static int uid = 0;
module_param(uid, int, 0644);

/**
 * struct ftrace_hook - bookkeeping for one ftrace-based syscall hook.
 * @nr:      syscall number (an __NR_* constant from unistd.h)
 * @new:     hook function that execution is redirected to
 * @orig:    location that receives the address of the original handler
 * @address: address of the original handler, read from sys_call_table
 * @ops:     ftrace_ops instance registered for this hook
 *
 * Instances are best initialised with PREPARE_HOOK(), for example:
 *
 *   static ftrace_hook_t sys_clone_hook =
 *       PREPARE_HOOK(__NR_clone, my_sys_clone, &orig_sys_clone);
 */
typedef struct ftrace_hook {
    unsigned long nr;
    void *new;
    void *orig;

    unsigned long address;
    struct ftrace_ops ops;
} ftrace_hook_t;

/**
 * Static initialiser for a ftrace_hook_t: fills in the syscall number,
 * the hook function and the location that will receive the original
 * handler's address. The remaining members are left zero-initialised.
 */
#define PREPARE_HOOK(_nr, _hook, _orig) \
    {                                   \
        .nr = (_nr),                    \
        .new = (_hook),                 \
        .orig = (_orig),                \
    }

/** Address of the kernel's system call table; filled in by resolve_address(). */
static unsigned long **sys_call_table;

/**
 * resolve_address() - find the original handler for the syscall in @hook.
 * @hook: hook descriptor; @hook->nr selects the syscall table entry.
 *
 * For the sake of simplicity, only the kprobe method is included.
 * If you want to know more about different methods to get
 * kallsyms_lookup_name, see syscall.c.
 *
 * On success the handler address is stored in @hook->address and also
 * written through @hook->orig, so the hook function can call the original.
 *
 * Return: 0 on success, a negative errno otherwise.
 */
static int resolve_address(ftrace_hook_t *hook)
{
    static struct kprobe kp = { .symbol_name = "kallsyms_lookup_name" };
    unsigned long (*kallsyms_lookup_name)(const char *name);
    int err;

    /*
     * Registering a kprobe on the symbol makes the kernel resolve its
     * address for us. Only unregister when registration succeeded.
     */
    err = register_kprobe(&kp);
    if (err) {
        pr_err("kallsyms_lookup_name is not found!\n");
        return err;
    }
    kallsyms_lookup_name = (unsigned long (*)(const char *))kp.addr;
    unregister_kprobe(&kp);

    if (!kallsyms_lookup_name) {
        pr_err("kallsyms_lookup_name is not found!\n");
        return -ENOENT;
    }
    pr_info("kallsyms_lookup_name is found at 0x%lx\n",
            (unsigned long)kallsyms_lookup_name);

    sys_call_table = (unsigned long **)kallsyms_lookup_name("sys_call_table");
    if (!sys_call_table) {
        pr_err("sys_call_table is not found!\n");
        return -ENOENT;
    }
    pr_info("sys_call_table is found at 0x%lx\n",
            (unsigned long)sys_call_table);

    hook->address = (unsigned long)sys_call_table[hook->nr];
    /* Hand the original handler's address to the hook function. */
    *((unsigned long *)hook->orig) = hook->address;
    return 0;
}

/**
* This is where the magic happens.
*
* We check whether this function is called by the kernel or this module
* by checking whether parent_ip is within this module.
*
* During the first call, parent_ip points to somewhere in the kernel
* that's not in this module,
* while the second call is in this module
* since it's called from our_sys_openat.
*
* If it is the first call, we modify ip to be our_sys_openat,
* which will pass control to it after ftrace is done.
*/
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 11, 0)
static void notrace ftrace_thunk(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ops,
struct ftrace_regs *fregs)
{
ftrace_hook_t *hook = container_of(ops, ftrace_hook_t, ops);

if (!within_module(parent_ip, THIS_MODULE))
fregs->regs.ip = (unsigned long)hook->new;
}

#else /** Version < v5.11 */
linD026 marked this conversation as resolved.
Show resolved Hide resolved
static void notrace ftrace_thunk(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ops, struct pt_regs *regs)
{
ftrace_hook_t *hook = container_of(ops, ftrace_hook_t, ops);

if (!within_module(parent_ip, THIS_MODULE))
regs->ip = (unsigned long)hook->new;
}

#endif /** Version >= v5.11 */

/**
 * install_hook() - resolve the target syscall and attach the ftrace callback.
 * @hook: hook descriptor prepared with PREPARE_HOOK().
 *
 * Return: 0 on success, or the error reported by resolve_address(),
 * ftrace_set_filter_ip() or register_ftrace_function().
 */
static int install_hook(ftrace_hook_t *hook)
{
    int err;

    err = resolve_address(hook);
    if (err)
        return err;

    /** The callback function */
    hook->ops.func = ftrace_thunk;
    /** We need registers and we're modifying ip */
    hook->ops.flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY;
    /** Only the resolved syscall handler should be traced */
    err = ftrace_set_filter_ip(&hook->ops, hook->address, 0, 0);
    if (err) {
        pr_err("ftrace_set_filter_ip() failed: %d\n", err);
        return err;
    }

    err = register_ftrace_function(&hook->ops);
    if (err) {
        pr_err("register_ftrace_function() failed: %d\n", err);
        /** Do not leave the filter behind when registration fails */
        ftrace_set_filter_ip(&hook->ops, hook->address, 1, 0);
        return err;
    }

    return 0;
}

/**
 * remove_hook() - detach the ftrace callback and drop its filter.
 * @hook: hook descriptor previously passed to install_hook().
 *
 * Failures are only logged; both steps are always attempted.
 */
static void remove_hook(ftrace_hook_t *hook)
{
    int ret = unregister_ftrace_function(&hook->ops);

    if (ret != 0)
        pr_err("unregister_ftrace_function() failed: %d\n", ret);

    /** Passing remove = 1 takes the address out of the filter again */
    ret = ftrace_set_filter_ip(&hook->ops, hook->address, 1, 0);
    if (ret != 0)
        pr_err("ftrace_set_filter_ip() failed: %d\n", ret);
}

/** For some reason the kernel segfaults when the parameters are expanded. */
static asmlinkage long (*original_call)(struct pt_regs *regs);
static asmlinkage long our_sys_openat(struct pt_regs *regs)
{
char *kfilename;
int errcode = 0;

if (current->cred->uid.val != uid)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Still have the warning.

/home/runner/work/lkmpg/lkmpg/examples/syscall-ftrace.c:180:16: warning: dereference of noderef expression

return original_call(regs);
kfilename = kmalloc(MAX_FILENAME_SIZE * sizeof(char), GFP_KERNEL);
if (!kfilename)
return original_call(regs);

/**
* This may only work in x86_64 because getting parameters
* from CPU registers is architecture-dependent.
*
* Change regs->si to appropriate registers
* if you are trying on different architecture.
*/
errcode =
copy_from_user(kfilename, (char __user *)regs->si, MAX_FILENAME_SIZE);
if (errcode < 0) {
kfree(kfilename);
return original_call(regs);
}

pr_info("File opened by UID %d: %s\n", uid, kfilename);
kfree(kfilename);

return original_call(regs);
}

/**
 * Hook descriptor for openat(): redirect to our_sys_openat and store the
 * original handler's address in original_call.
 */
static ftrace_hook_t sys_openat_hook =
PREPARE_HOOK(__NR_openat, our_sys_openat, &original_call);

/**
 * Module entry point: install the openat() hook.
 *
 * Returns 0 on success or the error code from install_hook().
 */
static int __init syscall_ftrace_start(void)
{
    int ret = install_hook(&sys_openat_hook);

    if (ret != 0)
        return ret;

    pr_info("hooked, spying on UID %d\n", uid);
    return 0;
}

/** Module exit point: take the openat() hook out again and log it. */
static void __exit syscall_ftrace_end(void)
{
    remove_hook(&sys_openat_hook);

    pr_info("removed\n");
}

/** Register the load and unload handlers of this module. */
module_init(syscall_ftrace_start);
module_exit(syscall_ftrace_end);
96 changes: 96 additions & 0 deletions lkmpg.tex
Original file line number Diff line number Diff line change
Expand Up @@ -1554,6 +1554,102 @@ \section{System Calls}

\samplec{examples/syscall.c}

Another technique we can utilize to control the flow of execution of a syscall is \verb|ftrace|.
It is an internal tracer designed to help out developers and designers of systems to find what is going on inside the kernel.
It can be used for debugging or analyzing latencies and performance issues that take place outside of user-space.
It is usually used as an event tracer by attaching callbacks to the beginning of functions in order to record and trace the flow of the kernel.

\begin{code}
struct ftrace_ops {
ftrace_func_t func; // callback function
unsigned long flags; // ftrace flags
void* private; // any private data
};
void callback_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ops, struct pt_regs *regs);
\end{code}

Above is the basic structure of the callback function, where the parameters are

\begin{itemize}
\item \cpp|ip|: The instruction pointer of the function being traced.
\item \cpp|parent_ip|: The instruction pointer of the caller of the traced function.
\item \cpp|ops|: A pointer to \cpp|ftrace_ops| that was used to register the callback.
\item \cpp|regs|: If \cpp|FTRACE_OPS_FL_SAVE_REGS| or \cpp|FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED| is set in the \cpp|ftrace_ops| structure,
then this points to a \cpp|pt_regs| structure, just as it would if a breakpoint had been placed at the start of the function that \verb|ftrace| is tracing, which gives access to the CPU registers.
Otherwise, it either contains garbage or is \cpp|NULL|.
Note that since kernel v5.11 this parameter is replaced with \cpp|struct ftrace_regs *fregs|, and the original \cpp|pt_regs| is accessible through \cpp|fregs->regs|.
\end{itemize}

Internally, there is a 5-byte \cpp|call| to \cpp|__fentry__| at the beginning (before the function prologue) of every traceable kernel function.
It is converted to a \cpp|nop| during boot to avoid overhead.
When a trace is registered, the \cpp|nop| is changed back to a \cpp|call| to \cpp|__fentry__|, and the registered callback is executed accordingly.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Separate the line with sentences.


But callbacks can do more.
Since it's called at the start of a function,
and we have access to CPU registers,
maybe we can ``hijack'' the traced function by modifying the instruction pointer?
Yes, this is possible by enabling \cpp|FTRACE_OPS_FL_IPMODIFY| flag when registering a trace.
It allows us to modify the instruction pointer register, which effectively turns the return from the \verb|ftrace| callback into an unconditional jump to the address we choose.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The clause introduced by which is restrictive. So, omit the comma.

Note that while there can be multiple tracers on one function, only one tracer that changes \cpp|ip| can be registered at the same time.

Figure~\ref{img:ftrace-hooking-example} gives an example of auditing \cpp|sys_execve| by hooking it using \verb|ftrace|.
The callback function (\cpp|fh_ftrace_thunk|) checks whether the call is from the kernel or the module,
and passes control accordingly.
If the call is from the kernel, our auditing function is called.
Otherwise, nothing happens.
The check is important because we're only ``decorating'' the original syscall.
Our auditing function contains a call to the original \cpp|sys_execve|,
which will trigger the callback function again.
It'll be an infinite loop if there's no check performed.

\begin{figure}[h]
\centering
\includegraphics[width=\textwidth]{assets/syscall/flow.jpg}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Utilize TikZ for drawing. See https://texample.net/tikz/examples/pgf-umlsd/
Avoid putting bitmap files.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

image

This is the best I got so far. Unfortunately pgf-umlsd doesn't support returning to functions other than the caller, so the hooking part isn't accurately represented.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, I have to rotate the figure by 90 degrees to minimize the overflow although it overflows anyway. Will it affect the output of the website or should I rotate it back and let it overflow?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alternatively, you can just render the partial sequences.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think I can rework this diagram in latex because it lacks features I need, like returning to functions other than caller (this is the most important one), annotation. (I'm not the creator of this sequence diagram so obtaining the "original" file is not possible either.)

I think the resolution of that jpeg is good enough for even printing, so I think I may left it as is.

The latex code I've written and the result
\begin{sequencediagram}
  \newthread{do_syscall_64}{do\_syscall\_64}
  \newinst[1.5]{sys_execve}{sys\_execve}
  \newinst[1.5]{ftrace}{[ftrace]}
  \newinst[1]{fh_ftrace_thunk}{fh\_ftrace\_thunk}
  \newinst[1]{fh_sys_execve}{fh\_sys\_execve}

      \postlevel \postlevel \postlevel
      \begin{call}{do_syscall_64}{\shortstack{
        \cpp|regs-ax=|\\
        \cpp|sys_call_table[nr]|\\
        \cpp|(regs->di,regs->si|\\
        \cpp|regs->dx,regs->r10|\\
        \cpp|regs->r8,regs->r9)|
      }}{sys_execve}{}
        \begin{call}{sys_execve}{call \cpp|__fentry__|}{ftrace}{}
          \begin{call}{ftrace}{}{fh_ftrace_thunk}{}
            \postlevel
          \end{call}
        \end{call}
        \begin{call}{sys_execve}{hooking}{fh_sys_execve}{\cpp|real_sys_execve()|}
          \postlevel
        \end{call}
        \postlevel
        \begin{call}{sys_execve}{call \cpp|__fentry__|}{ftrace}{}
          \begin{call}{ftrace}{}{fh_ftrace_thunk}{}
            \postlevel
          \end{call}
        \end{call}
        \begin{call}{sys_execve}{}{fh_sys_execve}{}
        \end{call}
      \end{call}
\end{sequencediagram}

image

\caption{Linux kernel hooking with ftrace \href{https://www.apriorit.com/dev-blog/546-hooking-linux-functions-2}{Source}}
\label{img:ftrace-hooking-example}
\end{figure}

In fact, this is what live kernel patching uses.
By redirecting the flow of execution,
end users can use patched functions instead of vulnerable ones without a reboot, as Figure~\ref{img:kernel-livepatching} shows.

\begin{figure}[h]
\centering
\includegraphics[width=\textwidth]{assets/syscall/kernel-livepatching1.png}\\
\vspace{1cm}
\includegraphics[width=\textwidth]{assets/syscall/kernel-livepatching2.png}
\caption{How live kernel patching works. \href{https://ubuntu.com/blog/an-overview-of-live-kernel-patching}{Source}}
\label{img:kernel-livepatching}
\end{figure}

For more information regarding \verb|ftrace|, check out \href{https://www.kernel.org/doc/html/latest/trace/ftrace.html}{the kernel documentation} and \href{https://youtu.be/93uE_kWWQjs}{this talk from Steven Rostedt}.

Before getting our hands dirty, here are some functions we need to know.

\begin{itemize}
\item \cpp|register_ftrace_function(struct ftrace_ops *ops)|: Enable tracing call defined by \cpp|ops|
\item \cpp|unregister_ftrace_function(struct ftrace_ops *ops)|: Disable tracing call defined by \cpp|ops|
\item \cpp|ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, int len, int reset)|: Denote which function should be enabled for tracing by its name. If \cpp|buf| is \cpp|NULL|, all functions will be enabled.
\item \cpp|ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, int remove, int reset)|: Denote which function should be enabled for tracing by its address. \cpp|remove| should be \cpp|0| when adding a trace, and \cpp|1| when removing one. Note that \cpp|ip| must be the address where the call to \cpp|__fentry__| is located in the function.
\end{itemize}

Alright, let's write some code.
Below is the source code of the example from above, but rewritten using \verb|ftrace|.
The main difference is the \cpp|install_hook| function,
which prepares our tracee function (\cpp|sys_openat|),
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't separate the line with comma.

and the callback function (\cpp|ftrace_thunk|).
We need both \cpp|FTRACE_OPS_FL_SAVE_REGS| and \cpp|FTRACE_OPS_FL_IPMODIFY| because we're modifying \cpp|ip|.
Inside \cpp|ftrace_thunk| is where the magic happens.
We check whether the traced function was called from within the module;
if not, the callback changes the instruction pointer to our ``spying'' function.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ditto.
Don't separate the line with comma.

The check is performed by checking whether \cpp|parent_ip| is within this module.
During the first call, \cpp|parent_ip| points to somewhere within the kernel,
while during the second call it points to somewhere in our ``spying'' function, which is within the module.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ditto, the comma.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't point out all of the cases. Please check again.


When inserting the module, you should provide the uid you want to spy on as a parameter.
For example, you can spy on yourself by \verb|sudo insmod syscall-ftrace.ko uid=$UID|.

\samplec{examples/syscall-ftrace.c}
jserv marked this conversation as resolved.
Show resolved Hide resolved

\section{Blocking Processes and threads}
\label{sec:blocking_process_thread}
\subsection{Sleep}
Expand Down